follows:
+------------------------------+
- | return PC |
- | frame pointer on entry | ← R29 points to
| ... locals ... |
| ... outgoing arguments ... |
- | unused word | ← RSP points to
+ | return PC | ← RSP points to
+ | frame pointer on entry |
+------------------------------+ ↓ lower addresses
The "return PC" is loaded to the link register, R30, as part of the
arm64 `CALL` operation.
-On entry, a function pushes R30 (the return address) and R29
-(the caller's frame pointer) onto the bottom of the stack. It then
-subtracts a constant from RSP to open its stack frame.
+On entry, a function subtracts from RSP to open its stack frame, and
+saves the values of R30 and R29 at the bottom of the frame.
+Specifically, R30 is saved at 0(RSP) and R29 is saved at -8(RSP),
+after RSP is updated.
A leaf function that does not require any stack space may omit the
saved R30 and R29.
)
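As a concrete reading of the layout above, here is a minimal standalone sketch (hypothetical helper, not part of the patch) of the two save-slot addresses relative to the stack pointer once the frame has been opened:

    // entrySaveSlots returns where the two registers are saved after the
    // prologue has subtracted the frame size from RSP: R30 (the return
    // PC) at 0(RSP), and R29 (the frame pointer on entry) at -8(RSP),
    // one word below the stack pointer.
    func entrySaveSlots(rsp uint64) (lrSlot, fpSlot uint64) {
    	return rsp, rsp - 8
    }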
func padframe(frame int64) int64 {
- // arm64 requires frame sizes here that are 8 mod 16.
- // With the additional (unused) slot at the bottom of the frame,
- // that makes an aligned 16 byte frame.
- // Adding a save region for LR+FP does not change the alignment.
- if frame != 0 {
- frame += (-(frame + 8)) & 15
+ // arm64 requires that the frame size (not counting the saved FP&LR)
+ // be 16-byte aligned. If not, pad it.
+ if frame%16 != 0 {
+ frame += 16 - (frame % 16)
}
return frame
}
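As a quick check of the arithmetic, a standalone program (illustrative only, not part of the patch) exercising padframe:

    package main

    import "fmt"

    func padframe(frame int64) int64 {
    	if frame%16 != 0 {
    		frame += 16 - (frame % 16)
    	}
    	return frame
    }

    func main() {
    	// A zero frame stays zero (frameless); everything else is
    	// rounded up to a multiple of 16.
    	for _, f := range []int64{0, 8, 24, 32} {
    		fmt.Printf("padframe(%d) = %d\n", f, padframe(f))
    	}
    	// Prints 0, 16, 32, 32.
    }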
for i := 0; i < len(args); i++ {
a := args[i]
- // Offset by size of the unused slot before start of args.
+ // Offset by size of the saved LR slot.
addr := ssagen.SpillSlotAddr(a, arm64.REGSP, base.Ctxt.Arch.FixedFrameSize)
// Look for double-register operations if we can.
if i < len(args)-1 {
case ir.PAUTO:
off = n.FrameOffset()
if base.Ctxt.Arch.FixedFrameSize == 0 {
- // x86 return address
off -= int64(types.PtrSize)
}
if buildcfg.FramePointerEnabled {
- // frame pointer
off -= int64(types.PtrSize)
- if buildcfg.GOARCH == "arm64" {
- // arm64 return address also
- off -= int64(types.PtrSize)
- }
}
}
return int32(off + slot.Off)
// Insert code to zero ambiguously live variables so that the
// garbage collector only sees initialized values when it
// looks for pointers.
- // Note: lo/hi are offsets from varp and will be negative.
var lo, hi int64
// Opaque state for backend to use. Current backends use it to
var state uint32
// Iterate through declarations. Autos are sorted in decreasing
- // frame offset order (least negative to most negative).
+ // frame offset order.
for _, n := range e.curfn.Dcl {
if !n.Needzero() {
continue
blitrl *obj.Prog
elitrl *obj.Prog
autosize int32
+ extrasize int32
instoffset int64
pc int64
pool struct {
ctxt.Diag("arm64 ops not initialized, call arm64.buildop first")
}
- c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset)}
+ c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset & 0xffffffff), extrasize: int32(p.To.Offset >> 32)}
+ p.To.Offset &= 0xffffffff // extrasize is no longer needed
// Process literal pool and allocate initial program counter for each Prog, before
// generating branch veneers.
// a.Offset is still relative to pseudo-SP.
a.Reg = obj.REG_NONE
}
- // The frame top 16 bytes are for LR/FP
- c.instoffset = int64(c.autosize) + a.Offset - extrasize
+ // The top 8 or 16 bytes of the frame are reserved for FP.
+ c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize)
return autoclass(c.instoffset)
case obj.NAME_PARAM:
// a.Offset is still relative to pseudo-SP.
a.Reg = obj.REG_NONE
}
- // The frame top 16 bytes are for LR/FP
- c.instoffset = int64(c.autosize) + a.Offset - extrasize
+ // The top 8 or 16 bytes of the frame are reserved for FP.
+ c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize)
case obj.NAME_PARAM:
if a.Reg == REGSP {
"cmd/internal/src"
"cmd/internal/sys"
"internal/abi"
+ "internal/buildcfg"
"log"
"math"
)
obj.Nopout(p)
}
-const extrasize = 16 // space needed in the frame for LR+FP
-
func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
if cursym.Func().Text == nil || cursym.Func().Text.Link == nil {
return
c.autosize = int32(textstksiz)
if p.Mark&LEAF != 0 && c.autosize == 0 {
- // A leaf function with no locals needs no frame.
+ // A leaf function with no locals has no frame.
p.From.Sym.Set(obj.AttrNoFrame, true)
}
if !p.From.Sym.NoFrame() {
// If there is a stack frame at all, it includes
- // space for the (now unused) word at [SP:SP+8].
+ // space to save the LR.
c.autosize += 8
}
- // Round up to a multiple of 16.
- c.autosize += (-c.autosize) & 15
-
if c.autosize != 0 {
- // Allocate an extra 16 bytes at the top of the frame
- // to save LR+FP.
+ extrasize := int32(0)
+ if c.autosize%16 == 8 {
+ // The frame size (not counting the LR slot) is 16-byte
+ // aligned; allocate an extra 8 bytes at the frame top to
+ // save FP.
+ extrasize = 8
+ } else if c.autosize&(16-1) == 0 {
+ // The frame size (not counting the LR slot) is 8 mod 16;
+ // allocate an extra 16 bytes (the FP save slot plus padding)
+ // to keep SP 16-byte aligned.
+ extrasize = 16
+ } else {
+ c.ctxt.Diag("%v: unaligned frame size %d - must be 16-byte aligned", p, c.autosize-8)
+ }
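+ // For example, 16 bytes of locals give autosize 24 (16 plus the
+ // 8-byte LR slot), so extrasize is 8 and the final frame is 32 bytes.
+ // A hand-written assembly frame of 8 bytes (8 mod 16) gives autosize
+ // 16, so extrasize is 16 and the final frame is again 32 bytes,
+ // keeping SP 16-byte aligned.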
c.autosize += extrasize
c.cursym.Func().Locals += extrasize
- p.To.Offset = int64(c.autosize)
+ // low 32 bits for autosize
+ // high 32 bits for extrasize
+ p.To.Offset = int64(c.autosize) | int64(extrasize)<<32
} else {
// NOFRAME
p.To.Offset = 0
var prologueEnd *obj.Prog
aoffset := c.autosize
- if aoffset < 16 {
- log.Fatalf("aoffset too small %d", aoffset)
+ if aoffset > 0xf0 {
+ // The MOVD.W pre-indexed offset ranges from -0x100 to 0xf8, and
+ // SP must stay 16-byte aligned, so the maximum usable aoffset
+ // is 0xf0.
+ aoffset = 0xf0
}
+ // Frame is non-empty. Make sure to save the link register, even in
+ // a leaf function, so that traceback works.
q = p
-
- // Store return address and frame pointer at the top of the stack frame.
- // STP.W (R29, R30), -16(SP)
- q1 = obj.Appendp(q, c.newprog)
- q1.Pos = p.Pos
- q1.As = ASTP
- q1.From.Type = obj.TYPE_REGREG
- q1.From.Reg = REGFP
- q1.From.Offset = REGLINK
- q1.To.Type = obj.TYPE_MEM
- q1.To.Reg = REG_RSP
- q1.To.Offset = -16
- q1.Scond = C_XPRE
-
- prologueEnd = q1
-
- // Update frame pointer
- q1 = obj.Appendp(q1, c.newprog)
- q1.Pos = p.Pos
- q1.As = AMOVD
- q1.From.Type = obj.TYPE_REG
- q1.From.Reg = REGSP
- q1.To.Type = obj.TYPE_REG
- q1.To.Reg = REGFP
-
- // Allocate additional frame space.
- adj := aoffset - 16
- if adj > 0 {
- // SUB $autosize-16, RSP
- if adj < 1<<12 {
- q1 = obj.Appendp(q1, c.newprog)
- q1.Pos = p.Pos
- q1.As = ASUB
- q1.From.Type = obj.TYPE_CONST
- q1.From.Offset = int64(adj)
- q1.To.Type = obj.TYPE_REG
- q1.To.Reg = REGSP
- } else {
- // Constant too big for atomic subtract.
- // Materialize in tmp register first.
- q1 = obj.Appendp(q1, c.newprog)
- q1.Pos = p.Pos
- q1.As = AMOVD
- q1.From.Type = obj.TYPE_CONST
- q1.From.Offset = int64(adj)
- q1.To.Type = obj.TYPE_REG
- q1.To.Reg = REGTMP
-
+ if c.autosize > aoffset {
+ // Frame size is too large for a MOVD.W instruction. Store the frame pointer
+ // register and link register before decrementing SP, so if a signal comes
+ // during the execution of the function prologue, the traceback code will
+ // not see a half-updated stack frame.
+
+ // SUB $autosize, RSP, R20
+ q1 = obj.Appendp(q, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = ASUB
+ q1.From.Type = obj.TYPE_CONST
+ q1.From.Offset = int64(c.autosize)
+ q1.Reg = REGSP
+ q1.To.Type = obj.TYPE_REG
+ q1.To.Reg = REG_R20
+
+ prologueEnd = q1
+
+ // STP (R29, R30), -8(R20)
+ q1 = obj.Appendp(q1, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = ASTP
+ q1.From.Type = obj.TYPE_REGREG
+ q1.From.Reg = REGFP
+ q1.From.Offset = REGLINK
+ q1.To.Type = obj.TYPE_MEM
+ q1.To.Reg = REG_R20
+ q1.To.Offset = -8
+
+ // This is not async preemptible, because if we open a frame
+ // at the current SP, it will clobber the saved LR.
+ q1 = c.ctxt.StartUnsafePoint(q1, c.newprog)
+
+ // MOVD R20, RSP
+ q1 = obj.Appendp(q1, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = AMOVD
+ q1.From.Type = obj.TYPE_REG
+ q1.From.Reg = REG_R20
+ q1.To.Type = obj.TYPE_REG
+ q1.To.Reg = REGSP
+ q1.Spadj = c.autosize
+
+ q1 = c.ctxt.EndUnsafePoint(q1, c.newprog, -1)
+
+ if buildcfg.GOOS == "ios" {
+ // iOS does not support SA_ONSTACK. We will run the signal handler
+ // on the G stack. If we write below SP, it may be clobbered by
+ // the signal handler. So we save FP and LR after decrementing SP.
+ // STP (R29, R30), -8(RSP)
q1 = obj.Appendp(q1, c.newprog)
q1.Pos = p.Pos
- q1.As = ASUB
- q1.From.Type = obj.TYPE_REG
- q1.From.Reg = REGTMP
- q1.To.Type = obj.TYPE_REG
+ q1.As = ASTP
+ q1.From.Type = obj.TYPE_REGREG
+ q1.From.Reg = REGFP
+ q1.From.Offset = REGLINK
+ q1.To.Type = obj.TYPE_MEM
q1.To.Reg = REGSP
+ q1.To.Offset = -8
}
- q1.Spadj = adj
+ } else {
+ // Small frame: update SP and save LR in a single MOVD.W instruction,
+ // so if a signal arrives during the execution of the function prologue,
+ // the traceback code will not see a half-updated stack frame.
+ // Also, on Linux, in a cgo binary we may get a SIGSETXID signal
+ // early on before the signal stack is set, as glibc doesn't allow
+ // us to block SIGSETXID. So it is important that we don't write below
+ // the SP until the signal stack is set.
+ // Luckily, all the functions from thread entry to setting the signal
+ // stack have small frames.
+ q1 = obj.Appendp(q, c.newprog)
+ q1.As = AMOVD
+ q1.Pos = p.Pos
+ q1.From.Type = obj.TYPE_REG
+ q1.From.Reg = REGLINK
+ q1.To.Type = obj.TYPE_MEM
+ q1.Scond = C_XPRE
+ q1.To.Offset = int64(-aoffset)
+ q1.To.Reg = REGSP
+ q1.Spadj = aoffset
+
+ prologueEnd = q1
+
+ // Frame pointer.
+ q1 = obj.Appendp(q1, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = AMOVD
+ q1.From.Type = obj.TYPE_REG
+ q1.From.Reg = REGFP
+ q1.To.Type = obj.TYPE_MEM
+ q1.To.Reg = REGSP
+ q1.To.Offset = -8
}
prologueEnd.Pos = prologueEnd.Pos.WithXlogue(src.PosPrologueEnd)
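+ // Set up the frame pointer for the new frame:
+ // SUB $8, RSP, R29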
+ q1 = obj.Appendp(q1, c.newprog)
+ q1.Pos = p.Pos
+ q1.As = ASUB
+ q1.From.Type = obj.TYPE_CONST
+ q1.From.Offset = 8
+ q1.Reg = REGSP
+ q1.To.Type = obj.TYPE_REG
+ q1.To.Reg = REGFP
+
case obj.ARET:
nocache(p)
if p.From.Type == obj.TYPE_CONST {
}
p.To = obj.Addr{}
aoffset := c.autosize
- if aoffset > 0 {
- if aoffset < 16 {
- log.Fatalf("aoffset too small %d", aoffset)
- }
- adj := aoffset - 16
- if adj > 0 {
- if adj < 1<<12 {
- // ADD $adj, RSP, RSP
- p.As = AADD
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = int64(adj)
- p.To.Type = obj.TYPE_REG
- p.To.Reg = REGSP
- } else {
- // Put frame size in a separate register and
- // add it in with a single instruction,
- // so we never have a partial frame during
- // the epilog. See issue 73259.
-
- // MOVD $adj, REGTMP
- p.As = AMOVD
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = int64(adj)
- p.To.Type = obj.TYPE_REG
- p.To.Reg = REGTMP
- // ADD REGTMP, RSP, RSP
- p = obj.Appendp(p, c.newprog)
- p.As = AADD
- p.From.Type = obj.TYPE_REG
- p.From.Reg = REGTMP
- p.To.Type = obj.TYPE_REG
- p.To.Reg = REGSP
- }
- p.Spadj = -adj
- }
-
- // Pop LR+FP.
- // LDP.P 16(RSP), (R29, R30)
- if p.As != obj.ARET {
+ if c.cursym.Func().Text.Mark&LEAF != 0 {
+ if aoffset != 0 {
+ // Restore frame pointer.
+ // ADD $framesize-8, RSP, R29
+ p.As = AADD
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = int64(c.autosize) - 8
+ p.Reg = REGSP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = REGFP
+
+ // Pop stack frame.
+ // ADD $framesize, RSP, RSP
p = obj.Appendp(p, c.newprog)
+ p.As = AADD
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = int64(c.autosize)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = REGSP
+ p.Spadj = -c.autosize
}
- p.As = ALDP
+ } else if aoffset <= 0xF0 {
+ // Small frame: restore LR and update SP in a single MOVD.P instruction.
+ // Using a single LDP for LR and FP would also be correct, but it would
+ // not pattern-match the prologue's MOVD.W and MOVD, which may cost
+ // performance through failed store-forwarding.
+
+ // MOVD -8(RSP), R29
+ p.As = AMOVD
p.From.Type = obj.TYPE_MEM
p.From.Reg = REGSP
- p.From.Offset = 16
+ p.From.Offset = -8
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = REGFP
+ p = obj.Appendp(p, c.newprog)
+
+ // MOVD.P offset(RSP), R30
+ p.As = AMOVD
+ p.From.Type = obj.TYPE_MEM
p.Scond = C_XPOST
+ p.From.Offset = int64(aoffset)
+ p.From.Reg = REGSP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = REGLINK
+ p.Spadj = -aoffset
+ } else {
+ // LDP -8(RSP), (R29, R30)
+ p.As = ALDP
+ p.From.Type = obj.TYPE_MEM
+ p.From.Offset = -8
+ p.From.Reg = REGSP
p.To.Type = obj.TYPE_REGREG
p.To.Reg = REGFP
p.To.Offset = REGLINK
- p.Spadj = -16
+
+ if aoffset < 1<<12 {
+ // ADD $aoffset, RSP, RSP
+ q = newprog()
+ q.As = AADD
+ q.From.Type = obj.TYPE_CONST
+ q.From.Offset = int64(aoffset)
+ q.To.Type = obj.TYPE_REG
+ q.To.Reg = REGSP
+ q.Spadj = -aoffset
+ q.Pos = p.Pos
+ q.Link = p.Link
+ p.Link = q
+ p = q
+ } else {
+ // Put frame size in a separate register and
+ // add it in with a single instruction,
+ // so we never have a partial frame during
+ // the epilogue. See issue 73259.
+
+ // MOVD $aoffset, REGTMP
+ q = newprog()
+ q.As = AMOVD
+ q.From.Type = obj.TYPE_CONST
+ q.From.Offset = int64(aoffset)
+ q.To.Type = obj.TYPE_REG
+ q.To.Reg = REGTMP
+ q.Pos = p.Pos
+ q.Link = p.Link
+ p.Link = q
+ p = q
+ // ADD REGTMP, RSP, RSP
+ q = newprog()
+ q.As = AADD
+ q.From.Type = obj.TYPE_REG
+ q.From.Reg = REGTMP
+ q.To.Type = obj.TYPE_REG
+ q.To.Reg = REGSP
+ q.Spadj = -aoffset
+ q.Pos = p.Pos
+ q.Link = p.Link
+ p.Link = q
+ p = q
+ }
}
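To summarize the three epilogue shapes selected above, a standalone sketch (hypothetical helper, not part of the patch) of the decision:

    // epilogueShape mirrors the branches above: leaf functions simply
    // pop the frame, small frames restore LR and pop with a single
    // MOVD.P, and large frames use LDP plus one or two ADDs (see
    // issue 73259).
    func epilogueShape(leaf bool, aoffset int32) string {
    	switch {
    	case leaf && aoffset == 0:
    		return "RET only, no frame"
    	case leaf:
    		return "ADD $framesize-8, RSP, R29; ADD $framesize, RSP; RET"
    	case aoffset <= 0xf0:
    		return "MOVD -8(RSP), R29; MOVD.P framesize(RSP), R30; RET"
    	default:
    		return "LDP -8(RSP), (R29, R30); ADD $framesize, RSP; RET"
    	}
    }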
// If enabled, this code emits 'MOV PC, R27' before every 'MOV LR, PC',
p.From.Type = obj.TYPE_REG
p.From.Reg = REGLINK
} else {
- /* MOVD framesize-8(RSP), Rd */
+ /* MOVD (RSP), Rd */
p.As = AMOVD
p.From.Type = obj.TYPE_MEM
p.From.Reg = REGSP
- p.From.Offset = int64(c.autosize - 8)
}
}
if p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP && p.Spadj == 0 {
p.From.Reg = int16(REG_LSL + r + (shift&7)<<5)
p.From.Offset = 0
}
- if p.To.Type == obj.TYPE_MEM && p.To.Reg == REG_RSP && (p.Scond == C_XPRE || p.Scond == C_XPOST) {
- p.Spadj += int32(-p.To.Offset)
- }
- if p.From.Type == obj.TYPE_MEM && p.From.Reg == REG_RSP && (p.Scond == C_XPRE || p.Scond == C_XPOST) {
- p.Spadj += int32(-p.From.Offset)
- }
}
}
Plan9Magic: uint32(4*26*26 + 7),
Plan9_64Bit: true,
- Adddynrel: adddynrel,
- Archinit: archinit,
- Archreloc: archreloc,
- Archrelocvariant: archrelocvariant,
- Gentext: gentext,
- Machoreloc1: machoreloc1,
- MachorelocSize: 8,
- PEreloc1: pereloc1,
- TLSIEtoLE: tlsIEtoLE,
- ReturnAddressAtTopOfFrame: true,
+ Adddynrel: adddynrel,
+ Archinit: archinit,
+ Archreloc: archreloc,
+ Archrelocvariant: archrelocvariant,
+ Gentext: gentext,
+ Machoreloc1: machoreloc1,
+ MachorelocSize: 8,
+ PEreloc1: pereloc1,
+ TLSIEtoLE: tlsIEtoLE,
ELF: ld.ELFArch{
Linuxdynld: "/lib64/ld-linux-x86-64.so.2",
Dwarfreglr: dwarfRegLR,
TrampLimit: 0x7c00000, // 26-bit signed offset * 4, leave room for PLT etc.
- Adddynrel: adddynrel,
- Archinit: archinit,
- Archreloc: archreloc,
- Archrelocvariant: archrelocvariant,
- Extreloc: extreloc,
- Gentext: gentext,
- GenSymsLate: gensymlate,
- Machoreloc1: machoreloc1,
- MachorelocSize: 8,
- PEreloc1: pereloc1,
- Trampoline: trampoline,
- ReturnAddressAtTopOfFrame: true,
+ Adddynrel: adddynrel,
+ Archinit: archinit,
+ Archreloc: archreloc,
+ Archrelocvariant: archrelocvariant,
+ Extreloc: extreloc,
+ Gentext: gentext,
+ GenSymsLate: gensymlate,
+ Machoreloc1: machoreloc1,
+ MachorelocSize: 8,
+ PEreloc1: pereloc1,
+ Trampoline: trampoline,
ELF: ld.ELFArch{
Androiddynld: "/system/bin/linker64",
if pcsp.Value > 0 {
// The return address is preserved at (CFA-frame_size)
// after a stack frame has been allocated.
- off := -spdelta
- if thearch.ReturnAddressAtTopOfFrame {
- // Except arm64, which has it at the top of frame.
- off = -int64(d.arch.PtrSize)
- }
deltaBuf = append(deltaBuf, dwarf.DW_CFA_offset_extended_sf)
deltaBuf = dwarf.AppendUleb128(deltaBuf, uint64(thearch.Dwarfreglr))
- deltaBuf = dwarf.AppendSleb128(deltaBuf, off/dataAlignmentFactor)
+ deltaBuf = dwarf.AppendSleb128(deltaBuf, -spdelta/dataAlignmentFactor)
} else {
// The return address is restored into the link register
// when a stack frame has been de-allocated.
// optional override for assignAddress
AssignAddress func(ldr *loader.Loader, sect *sym.Section, n int, s loader.Sym, va uint64, isTramp bool) (*sym.Section, int, uint64)
- // Reports whether the return address is stored at the top (highest address)
- // of the stack frame.
- ReturnAddressAtTopOfFrame bool
-
// ELF specific information.
ELF ELFArch
}
"cmd/internal/objabi"
"cmd/link/internal/loader"
"fmt"
+ "internal/buildcfg"
"sort"
"strings"
)
// that there are at least StackLimit bytes available below SP
// when morestack returns.
limit := objabi.StackNosplit(*flagRace) - sc.callSize
+ if buildcfg.GOARCH == "arm64" {
+ // Need an extra 8 bytes below SP to save FP.
+ limit -= 8
+ }
// Compute stack heights without any back-tracking information.
// This will almost certainly succeed and we can simply
Plan9Magic: uint32(4*11*11 + 7),
- Adddynrel: adddynrel,
- Archinit: archinit,
- Archreloc: archreloc,
- Archrelocvariant: archrelocvariant,
- Gentext: gentext,
- Machoreloc1: machoreloc1,
- PEreloc1: pereloc1,
- ReturnAddressAtTopOfFrame: true,
+ Adddynrel: adddynrel,
+ Archinit: archinit,
+ Archreloc: archreloc,
+ Archrelocvariant: archrelocvariant,
+ Gentext: gentext,
+ Machoreloc1: machoreloc1,
+ PEreloc1: pereloc1,
ELF: ld.ELFArch{
Linuxdynld: "/lib/ld-linux.so.2",
CBZ R4, nocgo
MOVD $_rt0_arm64_lib_go(SB), R0
MOVD $0, R1
+ SUB $16, RSP // reserve 16 bytes so the callee can save FP at sp-8.
BL (R4)
+ ADD $16, RSP
B restore
nocgo:
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R0
MOVD R0, RSP
+ MOVD (g_sched+gobuf_bp)(g), R29
MOVD $0, (g_sched+gobuf_sp)(g)
MOVD $0, (g_sched+gobuf_bp)(g)
RET
// Using a tail call here cleans up tracebacks since we won't stop
// at an intermediate systemstack.
MOVD 0(R26), R3 // code pointer
- ADD $16, RSP
- LDP.P 16(RSP), (R29,R30) // restore FP, LR
+ MOVD.P 16(RSP), R30 // restore LR
+ SUB $8, RSP, R29 // restore FP
B (R3)
// func switchToCrashStack0(fn func())
// Smashes R0.
TEXT gosave_systemstack_switch<>(SB),NOSPLIT|NOFRAME,$0
MOVD $runtime·systemstack_switch(SB), R0
- ADD $12, R0 // get past prologue
+ ADD $8, R0 // get past prologue
MOVD R0, (g_sched+gobuf_pc)(g)
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
TEXT ·asmcgocall_no_g(SB),NOSPLIT,$0-16
MOVD fn+0(FP), R1
MOVD arg+8(FP), R0
+ SUB $16, RSP // skip over saved frame pointer below RSP
BL (R1)
+ ADD $16, RSP // skip over saved frame pointer below RSP
RET
// func asmcgocall(fn, arg unsafe.Pointer) int32
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4
MOVD (g_sched+gobuf_pc)(g), R5
- MOVD R5, -8(R4)
+ MOVD R5, -48(R4)
MOVD (g_sched+gobuf_bp)(g), R5
- MOVD R5, -16(R4)
+ MOVD R5, -56(R4)
// Gather our arguments into registers.
MOVD fn+0(FP), R1
MOVD frame+8(FP), R2
CALL (R0) // indirect call to bypass nosplit check. We're on a different stack now.
// Restore g->sched (== m->curg->sched) from saved values.
- MOVD 40(RSP), R5
+ MOVD 0(RSP), R5
MOVD R5, (g_sched+gobuf_pc)(g)
MOVD RSP, R4
ADD $48, R4, R4
//
// This is ABIInternal because Go code injects its PC directly into new
// goroutine stacks.
-//
-// State before debugger starts doing anything:
-// | current |
-// | stack |
-// +-------------+ <- SP = origSP
-// stopped executing at PC = origPC
-// some values are in LR (origLR) and FP (origFP)
-//
-// After debugger has done steps 1-6 above:
-// | current |
-// | stack |
-// +-------------+ <- origSP
-// | ----- | (used to be a slot to store frame pointer on entry to origPC's frame.)
-// +-------------+
-// | origLR |
-// +-------------+ <- SP
-// | ----- |
-// +-------------+
-// | argsize |
-// +-------------+
-// LR = origPC, PC = debugCallV2
-//
-// debugCallV2 then modifies the stack up to the "good" label:
-// | current |
-// | stack |
-// +-------------+ <- origSP
-// | ----- | (used to be a slot to store frame pointer on entry to origPC's frame.)
-// +-------------+
-// | origLR |
-// +-------------+ <- where debugger left SP
-// | origPC |
-// +-------------+
-// | origFP |
-// +-------------+ <- FP = SP + 256
-// | saved |
-// | registers |
-// | (224 bytes) |
-// +-------------+ <- SP + 32
-// | space for |
-// | outargs |
-// +-------------+ <- SP + 8
-// | argsize |
-// +-------------+ <- SP
-
TEXT runtime·debugCallV2<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0
- MOVD R30, -8(RSP) // save origPC
- MOVD -16(RSP), R30 // save argsize in R30 temporarily
- MOVD.W R29, -16(RSP) // push origFP
- MOVD RSP, R29 // frame pointer chain now set up
- SUB $256, RSP, RSP // allocate frame
- MOVD R30, (RSP) // Save argsize on the stack
+ STP (R29, R30), -280(RSP)
+ SUB $272, RSP, RSP
+ SUB $8, RSP, R29
// Save all registers that may contain pointers so they can be
// conservatively scanned.
//
STP (R0, R1), (4*8)(RSP)
// Perform a safe-point check.
- MOVD 264(RSP), R0 // origPC
- MOVD R0, 8(RSP)
+ MOVD R30, 8(RSP) // Caller's PC
CALL runtime·debugCallCheck(SB)
MOVD 16(RSP), R0
CBZ R0, good
CALL runtime·debugCallWrap(SB); \
JMP restore
- MOVD (RSP), R0 // the argument frame size
+ MOVD 256(RSP), R0 // the argument frame size
DEBUG_CALL_DISPATCH(debugCall32<>, 32)
DEBUG_CALL_DISPATCH(debugCall64<>, 64)
DEBUG_CALL_DISPATCH(debugCall128<>, 128)
LDP (6*8)(RSP), (R2, R3)
LDP (4*8)(RSP), (R0, R1)
- MOVD 272(RSP), R30 // restore old lr (saved by (*sigctxt).pushCall)
- LDP 256(RSP), (R29, R27) // restore old fp, set up resumption address
- ADD $288, RSP, RSP // Pop frame, LR+FP, and block pushed by (*sigctxt).pushCall
+ LDP -8(RSP), (R29, R27)
+ ADD $288, RSP, RSP // Add 16 more bytes, see saveSigContext
+ MOVD -16(RSP), R30 // restore old lr
JMP (R27)
// runtime.debugCallCheck assumes that functions defined with the
l.stack += 8 // SP needs 16-byte alignment
}
- // allocate frame, save PC (in R30), FP (in R29) of interrupted instruction
- p("STP.W (R29, R30), -16(RSP)")
- p("MOVD RSP, R29") // set up new frame pointer
+ // allocate frame, save PC of interrupted instruction (in LR)
+ p("MOVD R30, %d(RSP)", -l.stack)
p("SUB $%d, RSP", l.stack)
+ p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux)
+ p("SUB $8, RSP, R29") // set up new frame pointer
+ // On iOS, save the LR again after decrementing SP. We run the
+ // signal handler on the G stack (as it doesn't support sigaltstack),
+ // so any writes below SP may be clobbered.
+ p("#ifdef GOOS_ios")
+ p("MOVD R30, (RSP)")
+ p("#endif")
l.save(g)
p("CALL ·asyncPreempt2(SB)")
l.restore(g)
- p("MOVD %d(RSP), R30", l.stack+16) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
- p("LDP %d(RSP), (R29, R27)", l.stack) // Restore frame pointer. Load PC into regtmp.
- p("ADD $%d, RSP", l.stack+32) // pop frame (including the space pushed by sigctxt.pushCall)
+ p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
+ p("MOVD -8(RSP), R29") // restore frame pointer
+ p("MOVD (RSP), R27") // load PC to REGTMP
+ p("ADD $%d, RSP", l.stack+16) // pop frame (including the space pushed by sigctxt.pushCall)
p("RET (R27)")
}
// the caller
gp.sched.bp = fp - 2*goarch.PtrSize
case goarch.IsArm64 != 0:
- // on arm64, the first two words of the frame are caller's PC
- // (the saved LR register) and the caller's BP.
- // Coincidentally, the same as amd64.
- gp.sched.bp = fp - 2*goarch.PtrSize
+ // On arm64, Go keeps the frame pointer one word below the sp
+ // (R29 = RSP - 8), so the bp for this frame is sp minus one
+ // word. fp is totally useless to us here, because it only gets
+ // us to the caller's fp.
+ gp.sched.bp = sp - goarch.PtrSize
}
gogo(&gp.sched)
}
#include "textflag.h"
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
- STP.W (R29, R30), -16(RSP)
- MOVD RSP, R29
+ MOVD R30, -496(RSP)
SUB $496, RSP
+ MOVD R29, -8(RSP)
+ SUB $8, RSP, R29
+ #ifdef GOOS_ios
+ MOVD R30, (RSP)
+ #endif
STP (R0, R1), 8(RSP)
STP (R2, R3), 24(RSP)
STP (R4, R5), 40(RSP)
LDP 40(RSP), (R4, R5)
LDP 24(RSP), (R2, R3)
LDP 8(RSP), (R0, R1)
- MOVD 512(RSP), R30
- LDP 496(RSP), (R29, R27)
- ADD $528, RSP
+ MOVD 496(RSP), R30
+ MOVD -8(RSP), R29
+ MOVD (RSP), R27
+ ADD $512, RSP
RET (R27)
// R3 = addr of incoming arg list
// Trigger SIGSEGV early.
- MOVD 72(RSP), R3 // 1st arg is addr. after two small frames (32 bytes each), get it at 72(RSP)
+ MOVD 40(RSP), R3 // 1st arg is addr. After two BL calls, it is at 40(RSP).
MOVB (R3), R13 // segv here if addr is bad
// Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend).
MOVD runtime·racearenastart(SB), R10
// Addr is within the good range, call the atomic function.
load_g
MOVD g_racectx(g), R0 // goroutine context
- MOVD 56(RSP), R1 // caller pc
+ MOVD 16(RSP), R1 // caller pc
MOVD R9, R2 // pc
- ADD $72, RSP, R3
- BL racecall<>(SB)
- RET
+ ADD $40, RSP, R3
+ JMP racecall<>(SB) // does not return
racecallatomic_ignore:
// Addr is outside the good range.
// Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op.
// racecall will call LLVM race code which might clobber R28 (g)
load_g
MOVD g_racectx(g), R0 // goroutine context
- MOVD 56(RSP), R1 // caller pc
+ MOVD 16(RSP), R1 // caller pc
MOVD R9, R2 // pc
- ADD $72, RSP, R3 // arguments
+ ADD $40, RSP, R3 // arguments
BL racecall<>(SB)
// Call __tsan_go_ignore_sync_end.
MOVD $__tsan_go_ignore_sync_end(SB), R9
MOVD (g_sched+gobuf_sp)(R11), R12
MOVD R12, RSP
call:
+ // Decrement SP past where the frame pointer is saved in the Go arm64
+ // ABI (one word below the stack pointer) so the race detector library
+ // code doesn't clobber it.
+ SUB $16, RSP
BL R9
MOVD R19, RSP
JMP (R20)
import (
"internal/abi"
+ "internal/goarch"
"internal/runtime/sys"
"unsafe"
)
// We arrange lr, and pc to pretend the panicking
// function calls sigpanic directly.
// Always save LR to stack so that panics in leaf
- // functions are correctly handled.
- // This extra space is known to gentraceback.
+ // functions are correctly handled. This smashes
+ // the stack frame but we're not going back there
+ // anyway.
sp := c.sp() - sys.StackAlign // needs only sizeof uint64, but must align the stack
c.set_sp(sp)
*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr()
+ // Make sure a valid frame pointer is saved on the stack so that the
+ // frame pointer checks in adjustframe are happy, if they're enabled.
+ // Frame pointer unwinding won't visit the sigpanic frame, since
+ // sigpanic will save the same frame pointer before calling into a panic
+ // function.
+ *(*uint64)(unsafe.Pointer(uintptr(sp - goarch.PtrSize))) = c.r29()
pc := gp.sigpc
sp := c.sp() - 16 // SP needs 16-byte alignment
c.set_sp(sp)
*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr()
+ // Make sure a valid frame pointer is saved on the stack so that the
+ // frame pointer checks in adjustframe are happy, if they're enabled.
+ // This is not actually used for unwinding.
+ *(*uint64)(unsafe.Pointer(uintptr(sp - goarch.PtrSize))) = c.r29()
// Set up PC and LR to pretend the function being signaled
// calls targetPC at resumePC.
c.set_lr(uint64(resumePC))
// | args to callee |
// +------------------+ <- frame->sp
//
-// (arm64)
+// (arm)
// +------------------+
// | args from caller |
// +------------------+ <- frame->argp
-// | <unused> |
-// +------------------+ <- frame->fp (aka caller's sp)
-// | return address |
+// | caller's retaddr |
// +------------------+
-// | caller's FP | (frame pointer always enabled: TODO)
+// | caller's FP (*) | (*) on ARM64, if framepointer_enabled && varp > sp
// +------------------+ <- frame->varp
// | locals |
// +------------------+
// | args to callee |
// +------------------+
-// | <unused> |
+// | return address |
// +------------------+ <- frame->sp
//
// varp > sp means that the function has a frame;
// varp == sp means frameless function.
-//
-// Alignment padding, if needed, will be between "locals" and "args to callee".
type adjustinfo struct {
old stack
}
// Adjust saved frame pointer if there is one.
- if goarch.ArchFamily == goarch.AMD64 && frame.argp-frame.varp == 2*goarch.PtrSize ||
- goarch.ArchFamily == goarch.ARM64 && frame.argp-frame.varp == 3*goarch.PtrSize {
+ if (goarch.ArchFamily == goarch.AMD64 || goarch.ArchFamily == goarch.ARM64) && frame.argp-frame.varp == 2*goarch.PtrSize {
if stackDebug >= 3 {
print(" saved bp\n")
}
throw("bad frame pointer")
}
}
- // This is the caller's frame pointer saved in the current frame.
+ // On AMD64, this is the caller's frame pointer saved in the current
+ // frame.
+ // On ARM64, this is the frame pointer of the caller's caller saved
+ // by the caller in its frame (one word below its SP).
adjustpointer(adjinfo, unsafe.Pointer(frame.varp))
}
if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" {
lrOff = 32 // FIXED_FRAME or sys.MinFrameSize
}
- if runtime.GOARCH == "arm64" {
- // skip 8 bytes at bottom of parent frame, then point
- // to the 8 bytes of the saved PC at the top of the frame.
- lrOff = 16
- }
lrPtr := (*uintptr)(unsafe.Pointer(uintptr(unsafe.Pointer(&arg)) - lrOff))
*lrPtr = 0xbad
// Start in the caller's frame.
if frame.pc == 0 {
if usesLR {
- // TODO: this isn't right on arm64. But also, this should
- // ~never happen. Calling a nil function will panic
- // when loading the PC out of the closure, not when
- // branching to that PC. (Closures should always have
- // valid PCs in their first word.)
frame.pc = *(*uintptr)(unsafe.Pointer(frame.sp))
frame.lr = 0
} else {
var lrPtr uintptr
if usesLR {
if innermost && frame.sp < frame.fp || frame.lr == 0 {
- if GOARCH == "arm64" {
- lrPtr = frame.fp - goarch.PtrSize
- } else {
- lrPtr = frame.sp
- }
+ lrPtr = frame.sp
frame.lr = *(*uintptr)(unsafe.Pointer(lrPtr))
}
} else {
// On x86, call instruction pushes return PC before entering new function.
frame.varp -= goarch.PtrSize
}
- if GOARCH == "arm64" && frame.varp > frame.sp {
- frame.varp -= goarch.PtrSize // LR have been saved, skip over it.
- }
// For architectures with frame pointers, if there's
// a frame, then there's a saved frame pointer here.
//
// NOTE: This code is not as general as it looks.
- // On x86 and arm64, the ABI is to save the frame pointer word at the
+ // On x86, the ABI is to save the frame pointer word at the
// top of the stack frame, so we have to back down over it.
- // No other architectures are framepointer-enabled at the moment.
+ // On arm64, the frame pointer should be at the bottom of
+ // the stack (with R29 (aka FP) = RSP), in which case we would
+ // not want to do the subtraction here. But we started out without
+ // any frame pointer, and when we wanted to add it, we didn't
+ // want to break all the assembly doing direct writes to 8(RSP)
+ // to set the first parameter to a called function.
+ // So we decided to write the FP link *below* the stack pointer
+ // (with R29 = RSP - 8 in Go functions).
+ // This is technically ABI-compatible but not standard.
+ // And it happens to end up mimicking the x86 layout.
+ // Other architectures may make different decisions.
if frame.varp > frame.sp && framepointer_enabled {
frame.varp -= goarch.PtrSize
}
gp := u.g.ptr()
if u.flags&(unwindPrintErrors|unwindSilentErrors) == 0 && u.frame.sp != gp.stktopsp {
print("runtime: g", gp.goid, ": frame.sp=", hex(u.frame.sp), " top=", hex(gp.stktopsp), "\n")
- print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "]\n")
+ print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "\n")
throw("traceback did not unwind completely")
}
}
# (CallSize is 32 on ppc64, 8 on amd64 for frame pointer.)
start 96 nosplit
start 100 nosplit; REJECT ppc64 ppc64le
-start 104 nosplit; REJECT ppc64 ppc64le
+start 104 nosplit; REJECT ppc64 ppc64le arm64
start 108 nosplit; REJECT ppc64 ppc64le
start 112 nosplit; REJECT ppc64 ppc64le arm64
start 116 nosplit; REJECT ppc64 ppc64le
# Because AMD64 uses frame pointer, it has 8 fewer bytes.
start 96 nosplit call f; f 0 nosplit
start 100 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le
-start 104 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le
+start 104 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le arm64
start 108 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le
start 112 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 arm64
start 116 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64
# Architectures differ in the same way as before.
start 96 nosplit call f; f 0 call f
start 100 nosplit call f; f 0 call f; REJECT ppc64 ppc64le
-start 104 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64
+start 104 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 arm64
start 108 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64
start 112 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 arm64
start 116 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64
# Indirect calls are assumed to be splitting functions.
start 96 nosplit callind
start 100 nosplit callind; REJECT ppc64 ppc64le
-start 104 nosplit callind; REJECT ppc64 ppc64le amd64
+start 104 nosplit callind; REJECT ppc64 ppc64le amd64 arm64
start 108 nosplit callind; REJECT ppc64 ppc64le amd64
start 112 nosplit callind; REJECT ppc64 ppc64le amd64 arm64
start 116 nosplit callind; REJECT ppc64 ppc64le amd64