From ba71817390f78bf8c479dc65d1bc51db98d667a7 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Felix=20Geisend=C3=B6rfer?=
Date: Thu, 9 Mar 2023 07:54:51 +0000
Subject: [PATCH] runtime/trace: enable frame pointer unwinding on amd64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Change the tracer to use frame pointer unwinding by default on amd64.
The expansion of inline frames is delayed until the stack table is
dumped at the end of the trace. This requires storing the skip argument
in the stack table, which now resides in pcBuf[0]. For stacks that are
not produced by traceStackID (e.g. CPU samples), a logicalStackSentinel
value in pcBuf[0] indicates that no inline expansion is needed.

Add a new GODEBUG=tracefpunwindoff=1 option to use the old unwinder if
needed.

Benchmarks show a considerable decrease in CPU overhead when using
frame pointer unwinding for trace events:

GODEBUG=tracefpunwindoff=1 ../bin/go test -run '^$' -bench '.+PingPong' -count 20 -v -trace /dev/null ./runtime | tee tracefpunwindoff1.txt
GODEBUG=tracefpunwindoff=0 ../bin/go test -run '^$' -bench '.+PingPong' -count 20 -v -trace /dev/null ./runtime | tee tracefpunwindoff0.txt

goos: linux
goarch: amd64
pkg: runtime
cpu: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
                │ tracefpunwindoff1.txt │     tracefpunwindoff0.txt      │
                │        sec/op         │    sec/op     vs base          │
PingPongHog-32            3782.5n ± 0%    740.7n ± 2%  -80.42% (p=0.000 n=20)

For #16638

Change-Id: I2928a2fcd8779a31c45ce0f2fbcc0179641190bb
Reviewed-on: https://go-review.googlesource.com/c/go/+/463835
Reviewed-by: Michael Pratt
Run-TryBot: Felix Geisendörfer
TryBot-Result: Gopher Robot
Reviewed-by: Michael Knyszek
---
 src/runtime/asm_amd64.s      |   4 +
 src/runtime/asm_arm64.s      |   4 +
 src/runtime/extern.go        |   5 ++
 src/runtime/os_js.go         |   3 +
 src/runtime/runtime1.go      |   2 +
 src/runtime/stubs_386.go     |   3 +
 src/runtime/stubs_amd64.go   |   4 +
 src/runtime/stubs_arm.go     |   3 +
 src/runtime/stubs_arm64.go   |   4 +
 src/runtime/stubs_loong64.go |   3 +
 src/runtime/stubs_mips64x.go |   3 +
 src/runtime/stubs_mipsx.go   |   3 +
 src/runtime/stubs_ppc64x.go  |   3 +
 src/runtime/stubs_riscv64.go |   3 +
 src/runtime/stubs_s390x.go   |   3 +
 src/runtime/trace.go         | 151 ++++++++++++++++++++++++++++++-----
 16 files changed, 183 insertions(+), 18 deletions(-)

diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 690d6bacf0..b4c03d7624 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -2087,3 +2087,7 @@ TEXT runtime·retpolineR12(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(12)
 TEXT runtime·retpolineR13(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(13)
 TEXT runtime·retpolineR14(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(14)
 TEXT runtime·retpolineR15(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(15)
+
+TEXT ·getcallerfp(SB),NOSPLIT|NOFRAME,$0
+	MOVQ BP, AX
+	RET
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 7b659e3929..e35131051c 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -1566,3 +1566,7 @@ TEXT runtime·panicSliceConvert(SB),NOSPLIT,$0-16
 	MOVD R2, R0
 	MOVD R3, R1
 	JMP runtime·goPanicSliceConvert(SB)
+
+TEXT ·getcallerfp(SB),NOSPLIT|NOFRAME,$0
+	MOVD R29, R0
+	RET
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index 8b92108c70..03d593906e 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -179,6 +179,11 @@ It is a comma-separated list of name=val pairs setting these named variables:
 	IDs will refer to the ID of the goroutine at the time of creation; it's possible for this
 	ID to be reused for another goroutine. Setting N to 0 will report no ancestry information.
+	tracefpunwindoff: setting tracefpunwindoff=1 forces the execution tracer to
+	use the runtime's default stack unwinder instead of frame pointer unwinding.
+	This increases tracer overhead, but could be helpful as a workaround or for
+	debugging unexpected regressions caused by frame pointer unwinding.
+
 	asyncpreemptoff: asyncpreemptoff=1 disables signal-based
 	asynchronous goroutine preemption. This makes some loops
 	non-preemptible for long periods, which may delay GC and
diff --git a/src/runtime/os_js.go b/src/runtime/os_js.go
index 79fdbcfc07..20ac524552 100644
--- a/src/runtime/os_js.go
+++ b/src/runtime/os_js.go
@@ -169,3 +169,6 @@ const preemptMSupported = false
 func preemptM(mp *m) {
 	// No threads, so nothing to do.
 }
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index 02237685c7..68a090a3c7 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -325,6 +325,7 @@ var debug struct {
 	asyncpreemptoff    int32
 	harddecommit       int32
 	adaptivestackstart int32
+	tracefpunwindoff   int32
 
 	// debug.malloc is used as a combined debug check
 	// in the malloc function and should be set
@@ -359,6 +360,7 @@ var dbgvars = []*dbgVar{
 	{name: "inittrace", value: &debug.inittrace},
 	{name: "harddecommit", value: &debug.harddecommit},
 	{name: "adaptivestackstart", value: &debug.adaptivestackstart},
+	{name: "tracefpunwindoff", value: &debug.tracefpunwindoff},
 	{name: "panicnil", atomic: &debug.panicnil},
 }
diff --git a/src/runtime/stubs_386.go b/src/runtime/stubs_386.go
index 300f167fff..ef531367c9 100644
--- a/src/runtime/stubs_386.go
+++ b/src/runtime/stubs_386.go
@@ -18,3 +18,6 @@ func emptyfunc()
 
 //go:noescape
 func asmcgocall_no_g(fn, arg unsafe.Pointer)
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/stubs_amd64.go b/src/runtime/stubs_amd64.go
index 687a506cdd..3e7cf4f383 100644
--- a/src/runtime/stubs_amd64.go
+++ b/src/runtime/stubs_amd64.go
@@ -47,3 +47,7 @@ func asmcgocall_no_g(fn, arg unsafe.Pointer)
 // respectively. Does not follow the Go ABI.
 func spillArgs()
 func unspillArgs()
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+// TODO: Make this a compiler intrinsic
+func getcallerfp() uintptr
diff --git a/src/runtime/stubs_arm.go b/src/runtime/stubs_arm.go
index 52c32937ae..be40a2b06a 100644
--- a/src/runtime/stubs_arm.go
+++ b/src/runtime/stubs_arm.go
@@ -23,3 +23,6 @@ func read_tls_fallback()
 
 //go:noescape
 func asmcgocall_no_g(fn, arg unsafe.Pointer)
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/stubs_arm64.go b/src/runtime/stubs_arm64.go
index bd0533d158..723337ce52 100644
--- a/src/runtime/stubs_arm64.go
+++ b/src/runtime/stubs_arm64.go
@@ -21,3 +21,7 @@ func emptyfunc()
 // respectively. Does not follow the Go ABI.
 func spillArgs()
 func unspillArgs()
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+// TODO: Make this a compiler intrinsic
+func getcallerfp() uintptr
diff --git a/src/runtime/stubs_loong64.go b/src/runtime/stubs_loong64.go
index 22366f508c..0575c3093d 100644
--- a/src/runtime/stubs_loong64.go
+++ b/src/runtime/stubs_loong64.go
@@ -9,3 +9,6 @@ package runtime
 // Called from assembly only; declared for go vet.
 func load_g()
 func save_g()
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/stubs_mips64x.go b/src/runtime/stubs_mips64x.go
index a9ddfc0256..dbc4424f6c 100644
--- a/src/runtime/stubs_mips64x.go
+++ b/src/runtime/stubs_mips64x.go
@@ -14,3 +14,6 @@ func save_g()
 
 //go:noescape
 func asmcgocall_no_g(fn, arg unsafe.Pointer)
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/stubs_mipsx.go b/src/runtime/stubs_mipsx.go
index d48f9b88e8..abae8418a1 100644
--- a/src/runtime/stubs_mipsx.go
+++ b/src/runtime/stubs_mipsx.go
@@ -9,3 +9,6 @@ package runtime
 // Called from assembly only; declared for go vet.
 func load_g()
 func save_g()
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/stubs_ppc64x.go b/src/runtime/stubs_ppc64x.go
index 95e43a5162..67035eb7e8 100644
--- a/src/runtime/stubs_ppc64x.go
+++ b/src/runtime/stubs_ppc64x.go
@@ -15,3 +15,6 @@ func reginit()
 // respectively. Does not follow the Go ABI.
 func spillArgs()
 func unspillArgs()
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/stubs_riscv64.go b/src/runtime/stubs_riscv64.go
index f677117871..0df1659c4a 100644
--- a/src/runtime/stubs_riscv64.go
+++ b/src/runtime/stubs_riscv64.go
@@ -14,3 +14,6 @@ func save_g()
 // respectively. Does not follow the Go ABI.
 func spillArgs()
 func unspillArgs()
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/stubs_s390x.go b/src/runtime/stubs_s390x.go
index 44c566e602..799f4e948f 100644
--- a/src/runtime/stubs_s390x.go
+++ b/src/runtime/stubs_s390x.go
@@ -7,3 +7,6 @@ package runtime
 // Called from assembly only; declared for go vet.
 func load_g()
 func save_g()
+
+// getcallerfp returns the address of the frame pointer in the caller's frame or 0 if not implemented.
+func getcallerfp() uintptr { return 0 }
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 14364ea5a7..c152b10336 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -260,7 +260,7 @@ func StartTrace() error {
 		gp.traceseq = 0
 		gp.tracelastp = getg().m.p
 		// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
-		id := trace.stackTab.put([]uintptr{startPCforTrace(gp.startpc) + sys.PCQuantum})
+		id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(gp.startpc) + sys.PCQuantum})
 		traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
 	}
 	if status == _Gwaiting {
@@ -278,7 +278,7 @@ func StartTrace() error {
 		gp.traceseq = 0
 		gp.tracelastp = getg().m.p
 		// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
-		id := trace.stackTab.put([]uintptr{startPCforTrace(0) + sys.PCQuantum}) // no start pc
+		id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(0) + sys.PCQuantum}) // no start pc
 		traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
 		gp.traceseq++
 		traceEvent(traceEvGoInSyscall, -1, gp.goid)
@@ -862,27 +862,55 @@ func traceReadCPU() {
 				})
 				buf = bufp.ptr()
 			}
-			for i := range stk {
-				if i >= len(buf.stk) {
-					break
-				}
-				buf.stk[i] = uintptr(stk[i])
+			nstk := 1
+			buf.stk[0] = logicalStackSentinel
+			for ; nstk < len(buf.stk) && nstk-1 < len(stk); nstk++ {
+				buf.stk[nstk] = uintptr(stk[nstk-1])
 			}
-			stackID := trace.stackTab.put(buf.stk[:len(stk)])
+			stackID := trace.stackTab.put(buf.stk[:nstk])
 
 			traceEventLocked(0, nil, 0, bufp, traceEvCPUSample, stackID, 1, timestamp/traceTickDiv, ppid, goid)
 		}
 	}
 }
 
-func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
+// logicalStackSentinel is a sentinel value at pcBuf[0] signifying that
+// pcBuf[1:] holds a logical stack requiring no further processing. Any other
+// value at pcBuf[0] represents a skip value to apply to the physical stack in
+// pcBuf[1:] after inline expansion.
+const logicalStackSentinel = ^uintptr(0)
+
+// traceStackID captures a stack trace into pcBuf, registers it in the trace
+// stack table, and returns its unique ID. pcBuf should have a length equal to
+// traceStackSize. skip controls the number of leaf frames to omit in order to
+// hide tracer internals from stack traces; see CL 5523.
+func traceStackID(mp *m, pcBuf []uintptr, skip int) uint64 {
 	gp := getg()
 	curgp := mp.curg
-	var nstk int
-	if curgp == gp {
-		nstk = callers(skip+1, buf)
-	} else if curgp != nil {
-		nstk = gcallers(curgp, skip, buf)
+	nstk := 1
+	if tracefpunwindoff() {
+		// Slow path: Unwind using default unwinder. Used when frame pointer
+		// unwinding is unavailable or disabled.
+		pcBuf[0] = logicalStackSentinel
+		if curgp == gp {
+			nstk += callers(skip+1, pcBuf[1:])
+		} else if curgp != nil {
+			nstk += gcallers(curgp, skip, pcBuf[1:])
+		}
+	} else {
+		// Fast path: Unwind using frame pointers.
+		pcBuf[0] = uintptr(skip)
+		if curgp == gp {
+			nstk += fpTracebackPCs(unsafe.Pointer(getcallerfp()), skip, pcBuf[1:])
+		} else if curgp != nil {
+			// We're called on the g0 stack through mcall(fn) or systemstack(fn). To
+			// behave like gcallers above, we start unwinding from sched.bp, which
+			// points to the caller frame of the leaf frame on g's stack. The return
+			// address of the leaf frame is stored in sched.pc, which we manually
+			// capture here.
+			pcBuf[1] = curgp.sched.pc
+			nstk += 1 + fpTracebackPCs(unsafe.Pointer(curgp.sched.bp), skip, pcBuf[2:])
+		}
 	}
 	if nstk > 0 {
 		nstk-- // skip runtime.goexit
@@ -890,10 +918,32 @@ func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
 	if nstk > 0 && curgp.goid == 1 {
 		nstk-- // skip runtime.main
 	}
-	id := trace.stackTab.put(buf[:nstk])
+	id := trace.stackTab.put(pcBuf[:nstk])
 	return uint64(id)
 }
 
+// tracefpunwindoff returns true if frame pointer unwinding for the tracer is
+// disabled via GODEBUG or not supported by the architecture.
+func tracefpunwindoff() bool {
+	// The compiler emits frame pointers for amd64 and arm64, but issue 58432
+	// blocks arm64 support for now.
+	return debug.tracefpunwindoff != 0 || goarch.ArchFamily != goarch.AMD64
+}
+
+// fpTracebackPCs populates pcBuf with the return addresses for each frame and
+// returns the number of PCs written to pcBuf. The returned PCs correspond to
+// "physical frames" rather than "logical frames"; that is, if A is inlined into
+// B, this will return a PC for only B.
+func fpTracebackPCs(fp unsafe.Pointer, skip int, pcBuf []uintptr) (i int) {
+	for i = 0; i < len(pcBuf) && fp != nil; i++ {
+		// return addr sits one word above the frame pointer
+		pcBuf[i] = *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
+		// follow the frame pointer to the next one
+		fp = unsafe.Pointer(*(*uintptr)(fp))
+	}
+	return i
+}
+
 // traceAcquireBuffer returns trace buffer to use and, if necessary, locks it.
 func traceAcquireBuffer() (mp *m, pid int32, bufp *traceBufPtr) {
 	// Any time we acquire a buffer, we may end up flushing it,
@@ -1178,7 +1228,7 @@ func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
 		stk := tab.tab[i].ptr()
 		for ; stk != nil; stk = stk.link.ptr() {
 			var frames []traceFrame
-			frames, bufp = traceFrames(bufp, stk.stack())
+			frames, bufp = traceFrames(bufp, fpunwindExpand(stk.stack()))
 
 			// Estimate the size of this record. This
 			// bound is pretty loose, but avoids counting
@@ -1218,6 +1268,62 @@ func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
 	return bufp
 }
 
+// fpunwindExpand checks if pcBuf contains logical frames (which include inlined
+// frames) or physical frames (produced by frame pointer unwinding) using a
+// sentinel value in pcBuf[0]. Logical frames are simply returned without the
+// sentinel. Physical frames are turned into logical frames via inline unwinding
+// and by applying the skip value that's stored in pcBuf[0].
+func fpunwindExpand(pcBuf []uintptr) []uintptr {
+	if len(pcBuf) > 0 && pcBuf[0] == logicalStackSentinel {
+		// pcBuf contains logical rather than physical frames; skip has already
+		// been applied, so just return it without the sentinel value in pcBuf[0].
+		return pcBuf[1:]
+	}
+
+	var (
+		cache      pcvalueCache
+		lastFuncID = funcID_normal
+		newPCBuf   = make([]uintptr, 0, traceStackSize)
+		skip       = pcBuf[0]
+		// skipOrAdd skips or appends retPC to newPCBuf and returns true if more
+		// pcs can be added.
+		skipOrAdd = func(retPC uintptr) bool {
+			if skip > 0 {
+				skip--
+			} else {
+				newPCBuf = append(newPCBuf, retPC)
+			}
+			return len(newPCBuf) < cap(newPCBuf)
+		}
+	)
+
+outer:
+	for _, retPC := range pcBuf[1:] {
+		callPC := retPC - 1
+		fi := findfunc(callPC)
+		if !fi.valid() {
+			// There is no funcInfo if callPC belongs to a C function. In this case
+			// we still keep the pc, but don't attempt to expand inlined frames.
+			if more := skipOrAdd(retPC); !more {
+				break outer
+			}
+			continue
+		}
+
+		u, uf := newInlineUnwinder(fi, callPC, &cache)
+		for ; uf.valid(); uf = u.next(uf) {
+			sf := u.srcFunc(uf)
+			if sf.funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) {
+				// ignore wrappers
+			} else if more := skipOrAdd(uf.pc + 1); !more {
+				break outer
+			}
+			lastFuncID = sf.funcID
+		}
+	}
+	return newPCBuf
+}
+
 type traceFrame struct {
 	PC     uintptr
 	funcID uint64
@@ -1390,7 +1496,7 @@ func traceGoCreate(newg *g, pc uintptr) {
 	newg.traceseq = 0
 	newg.tracelastp = getg().m.p
 	// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
-	id := trace.stackTab.put([]uintptr{startPCforTrace(pc) + sys.PCQuantum})
+	id := trace.stackTab.put([]uintptr{logicalStackSentinel, startPCforTrace(pc) + sys.PCQuantum})
 	traceEvent(traceEvGoCreate, 2, newg.goid, uint64(id))
 }
 
@@ -1443,7 +1549,16 @@ func traceGoUnpark(gp *g, skip int) {
 }
 
 func traceGoSysCall() {
-	traceEvent(traceEvGoSysCall, 1)
+	if tracefpunwindoff() {
+		traceEvent(traceEvGoSysCall, 1)
+	} else {
+		// The default unwinder starts unwinding from gp.syscallsp
+		// which is captured 3 frames above this frame. We could
+		// capture gp.syscallbp to allow frame pointer unwinding to
+		// behave the same, but skipping 3 more frames here is
+		// simpler.
+		traceEvent(traceEvGoSysCall, 4)
+	}
 }
 
 func traceGoSysExit(ts int64) {
-- 
2.50.0
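
To make the pcBuf[0] convention above easier to experiment with outside the
runtime, the following is a minimal, self-contained sketch of the encoding
that traceStackID writes and that fpunwindExpand later consumes. The helper
names (encodeLogical, encodePhysical, expandSkipOnly) and the sample PC
values are made up for illustration; unlike the runtime's fpunwindExpand,
this sketch only applies the stored skip and performs no inline expansion,
which requires the runtime's inline unwinder.

	package main

	import "fmt"

	// Mirrors the sentinel the patch stores in pcBuf[0]; any other value
	// in that slot is treated as a pending skip count.
	const logicalStackSentinel = ^uintptr(0)

	// encodeLogical marks pcs as an already-expanded logical stack.
	func encodeLogical(pcs []uintptr) []uintptr {
		return append([]uintptr{logicalStackSentinel}, pcs...)
	}

	// encodePhysical marks pcs as a raw frame-pointer stack whose skip
	// has not been applied yet.
	func encodePhysical(skip int, pcs []uintptr) []uintptr {
		return append([]uintptr{uintptr(skip)}, pcs...)
	}

	// expandSkipOnly undoes the encoding, applying only the skip.
	func expandSkipOnly(pcBuf []uintptr) []uintptr {
		if len(pcBuf) > 0 && pcBuf[0] == logicalStackSentinel {
			return pcBuf[1:] // logical stack: skip was already applied
		}
		skip := int(pcBuf[0])
		stk := pcBuf[1:]
		if skip > len(stk) {
			skip = len(stk)
		}
		return stk[skip:]
	}

	func main() {
		// Fake PCs: 0x4f0000 stands in for a tracer-internal leaf frame.
		logical := encodeLogical([]uintptr{0x401000, 0x402000})
		physical := encodePhysical(1, []uintptr{0x4f0000, 0x401000, 0x402000})
		fmt.Printf("%#x\n", expandSkipOnly(logical))  // [0x401000 0x402000]
		fmt.Printf("%#x\n", expandSkipOnly(physical)) // [0x401000 0x402000]
	}

Both calls print the same two-frame stack, mirroring how the CPU-sample path
(sentinel in pcBuf[0], skip already applied) and the traceStackID fast path
(skip stored in pcBuf[0], applied at dump time) converge when the stack table
is dumped at the end of the trace.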