From f66581ead6f6dbc94b1d2fddee11abff722039e1 Mon Sep 17 00:00:00 2001 From: qmuntal Date: Thu, 19 Jan 2023 16:17:56 +0100 Subject: [PATCH] cmd/internal/obj/x86: use push/pop instead of mov to store/load FP This CL changes how the x86 compiler stores and loads the frame pointer on each function prologue and epilogue, with the goal to reduce the final binary size without affecting performance. The compiler is currently using MOV instructions to load and store BP, which can take from 5 to 8 bytes each. This CL changes this approach so it emits PUSH/POP instructions instead, which always take only 1 byte each (when operating with BP). It can also avoid using the SUBQ/ADDQ to grow the stack for functions that have frame pointer but does not have local variables. On Windows, this CL reduces the go toolchain size from 15,697,920 bytes to 15,584,768 bytes, a reduction of 0.7%. Example of epilog and prologue for a function with 0x10 bytes of local variables: Before === SUBQ $0x18, SP MOVQ BP, 0x10(SP) LEAQ 0x10(SP), BP ... function body ... MOVQ 0x10(SP), BP ADDQ $0x18, SP RET === After === PUSHQ BP LEAQ 0(SP), BP SUBQ $0x10, SP ... function body ... MOVQ ADDQ $0x10, SP POPQ BP RET === Updates #6853 Change-Id: Ice9e14bbf8dff083c5f69feb97e9a764c3ca7785 Reviewed-on: https://go-review.googlesource.com/c/go/+/462300 Reviewed-by: Keith Randall Reviewed-by: Cherry Mui TryBot-Result: Gopher Robot Reviewed-by: Keith Randall Run-TryBot: Quim Muntal --- src/cmd/internal/obj/x86/obj6.go | 70 +++++++++++++++----------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/src/cmd/internal/obj/x86/obj6.go b/src/cmd/internal/obj/x86/obj6.go index aa4cc225c6..33c02d59e8 100644 --- a/src/cmd/internal/obj/x86/obj6.go +++ b/src/cmd/internal/obj/x86/obj6.go @@ -684,37 +684,13 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p, regg = loadG(ctxt, cursym, p, newprog) } - // Delve debugger would like the next instruction to be noted as the end of the function prologue. - // TODO: are there other cases (e.g., wrapper functions) that need marking? - markedPrologue := false - - if autoffset != 0 { - if autoffset%int32(ctxt.Arch.RegSize) != 0 { - ctxt.Diag("unaligned stack size %d", autoffset) - } - p = obj.Appendp(p, newprog) - p.As = AADJSP - p.From.Type = obj.TYPE_CONST - p.From.Offset = int64(autoffset) - p.Spadj = autoffset - p.Pos = p.Pos.WithXlogue(src.PosPrologueEnd) - markedPrologue = true - } - if bpsize > 0 { // Save caller's BP p = obj.Appendp(p, newprog) - p.As = AMOVQ + p.As = APUSHQ p.From.Type = obj.TYPE_REG p.From.Reg = REG_BP - p.To.Type = obj.TYPE_MEM - p.To.Reg = REG_SP - p.To.Scale = 1 - p.To.Offset = int64(autoffset) - int64(bpsize) - if !markedPrologue { - p.Pos = p.Pos.WithXlogue(src.PosPrologueEnd) - } // Move current frame to BP p = obj.Appendp(p, newprog) @@ -723,11 +699,32 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p.From.Type = obj.TYPE_MEM p.From.Reg = REG_SP p.From.Scale = 1 - p.From.Offset = int64(autoffset) - int64(bpsize) + p.From.Offset = 0 p.To.Type = obj.TYPE_REG p.To.Reg = REG_BP } + if autoffset%int32(ctxt.Arch.RegSize) != 0 { + ctxt.Diag("unaligned stack size %d", autoffset) + } + + // localoffset is autoffset discounting the frame pointer, + // which has already been allocated in the stack. + localoffset := autoffset - int32(bpsize) + if localoffset != 0 { + p = obj.Appendp(p, newprog) + p.As = AADJSP + p.From.Type = obj.TYPE_CONST + p.From.Offset = int64(localoffset) + p.Spadj = localoffset + } + + // Delve debugger would like the next instruction to be noted as the end of the function prologue. + // TODO: are there other cases (e.g., wrapper functions) that need marking? + if autoffset != 0 { + p.Pos = p.Pos.WithXlogue(src.PosPrologueEnd) + } + if cursym.Func().Text.From.Sym.Wrapper() { // if g._panic != nil && g._panic.argp == FP { // g._panic.argp = bottom-of-frame @@ -933,24 +930,23 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { if autoffset != 0 { to := p.To // Keep To attached to RET for retjmp below p.To = obj.Addr{} + if localoffset != 0 { + p.As = AADJSP + p.From.Type = obj.TYPE_CONST + p.From.Offset = int64(-localoffset) + p.Spadj = -localoffset + p = obj.Appendp(p, newprog) + } + if bpsize > 0 { // Restore caller's BP - p.As = AMOVQ - - p.From.Type = obj.TYPE_MEM - p.From.Reg = REG_SP - p.From.Scale = 1 - p.From.Offset = int64(autoffset) - int64(bpsize) + p.As = APOPQ p.To.Type = obj.TYPE_REG p.To.Reg = REG_BP + p.Spadj = -int32(bpsize) p = obj.Appendp(p, newprog) } - p.As = AADJSP - p.From.Type = obj.TYPE_CONST - p.From.Offset = int64(-autoffset) - p.Spadj = -autoffset - p = obj.Appendp(p, newprog) p.As = obj.ARET p.To = to -- 2.48.1