From 5923a97f43bd7b8910fa69e3c02cdef2c531cdcf Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao Date: Tue, 3 Sep 2024 20:11:06 +0800 Subject: [PATCH] cmd/internal/obj: optimize the function stacksplit on loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In the process of stack split checking, loong64 uses the following logic: if SP > stackguard then goto done, else morestack The possible problem here is that the probability of morestack execution is much lower than done, while static branch prediction is more inclined to obtain morestack, which will cause a certain probability of branch prediction error. Change the logic here to: if SP <= stackguard then goto morestack, else done benchmarks on 3A6000: goos: linux goarch: loong64 pkg: fmt cpu: Loongson-3A6000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ SprintfPadding 418.3n ± 1% 387.0n ± 0% -7.49% (p=0.000 n=20) SprintfEmpty 35.95n ± 0% 35.86n ± 0% -0.25% (p=0.000 n=20) SprintfString 75.02n ± 1% 72.24n ± 0% -3.71% (p=0.000 n=20) SprintfTruncateString 165.7n ± 3% 139.9n ± 1% -15.58% (p=0.000 n=20) SprintfTruncateBytes 171.0n ± 0% 147.3n ± 0% -13.83% (p=0.000 n=20) SprintfSlowParsingPath 90.56n ± 0% 80.85n ± 0% -10.72% (p=0.000 n=20) SprintfQuoteString 560.2n ± 0% 509.7n ± 0% -9.01% (p=0.000 n=20) SprintfInt 58.62n ± 0% 56.45n ± 0% -3.70% (p=0.000 n=20) SprintfIntInt 141.7n ± 0% 122.2n ± 0% -13.73% (p=0.000 n=20) SprintfPrefixedInt 210.6n ± 0% 208.8n ± 0% -0.88% (p=0.000 n=20) SprintfFloat 282.3n ± 0% 251.8n ± 1% -10.80% (p=0.000 n=20) SprintfComplex 854.1n ± 0% 813.8n ± 0% -4.71% (p=0.000 n=20) SprintfBoolean 76.32n ± 0% 71.14n ± 1% -6.79% (p=0.000 n=20) SprintfHexString 218.5n ± 0% 193.4n ± 0% -11.51% (p=0.000 n=20) SprintfHexBytes 321.3n ± 0% 275.0n ± 0% -14.42% (p=0.000 n=20) SprintfBytes 573.5n ± 0% 553.2n ± 1% -3.54% (p=0.000 n=20) SprintfStringer 501.1n ± 1% 446.6n ± 0% -10.86% (p=0.000 n=20) SprintfStructure 1.793µ ± 0% 1.683µ ± 0% -6.16% (p=0.000 n=20) ManyArgs 500.0n ± 0% 470.4n ± 0% -5.92% (p=0.000 n=20) FprintInt 67.51n ± 0% 65.71n ± 0% -2.66% (p=0.000 n=20) FprintfBytes 130.9n ± 0% 129.5n ± 1% -1.11% (p=0.000 n=20) FprintIntNoAlloc 67.55n ± 0% 65.80n ± 0% -2.58% (p=0.000 n=20) ScanInts 386.3µ ± 0% 346.5µ ± 0% -10.29% (p=0.000 n=20) ScanRecursiveInt 25.97m ± 0% 25.93m ± 0% -0.15% (p=0.038 n=20) ScanRecursiveIntReaderWrapper 26.07m ± 0% 25.93m ± 0% -0.53% (p=0.001 n=20) geomean 702.6n 653.7n -6.96% goos: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ BinaryTree17 7.688 ± 1% 7.724 ± 0% +0.47% (p=0.040 n=20) Fannkuch11 2.670 ± 0% 2.645 ± 0% -0.94% (p=0.000 n=20) FmtFprintfEmpty 35.93n ± 0% 37.50n ± 0% +4.37% (p=0.000 n=20) FmtFprintfString 56.32n ± 0% 59.74n ± 0% +6.08% (p=0.000 n=20) FmtFprintfInt 64.47n ± 0% 61.26n ± 0% -4.98% (p=0.000 n=20) FmtFprintfIntInt 100.30n ± 0% 99.67n ± 0% -0.63% (p=0.000 n=20) FmtFprintfPrefixedInt 116.7n ± 0% 119.3n ± 0% +2.23% (p=0.000 n=20) FmtFprintfFloat 234.1n ± 0% 203.4n ± 0% -13.11% (p=0.000 n=20) FmtManyArgs 503.0n ± 0% 467.9n ± 0% -6.96% (p=0.000 n=20) GobDecode 8.125m ± 0% 7.299m ± 0% -10.17% (p=0.000 n=20) GobEncode 8.930m ± 1% 8.581m ± 1% -3.91% (p=0.000 n=20) Gzip 280.0m ± 0% 279.8m ± 0% -0.10% (p=0.000 n=20) Gunzip 33.30m ± 0% 32.48m ± 0% -2.49% (p=0.000 n=20) HTTPClientServer 55.43µ ± 0% 54.10µ ± 1% -2.41% (p=0.000 n=20) JSONEncode 10.086m ± 0% 9.055m ± 0% -10.22% (p=0.000 n=20) JSONDecode 49.37m ± 1% 46.22m ± 1% -6.40% (p=0.000 n=20) Mandelbrot200 4.606m ± 0% 4.606m ± 0% ~ (p=0.280 n=20) GoParse 5.010m ± 0% 4.855m ± 0% -3.09% (p=0.000 n=20) RegexpMatchEasy0_32 59.09n ± 0% 59.32n ± 0% +0.39% (p=0.000 n=20) RegexpMatchEasy0_1K 455.2n ± 0% 453.8n ± 0% -0.31% (p=0.000 n=20) RegexpMatchEasy1_32 59.24n ± 0% 60.11n ± 0% +1.47% (p=0.000 n=20) RegexpMatchEasy1_1K 555.2n ± 0% 553.9n ± 0% -0.23% (p=0.000 n=20) RegexpMatchMedium_32 845.7n ± 0% 775.6n ± 0% -8.28% (p=0.000 n=20) RegexpMatchMedium_1K 26.68µ ± 0% 26.48µ ± 0% -0.78% (p=0.000 n=20) RegexpMatchHard_32 1.317µ ± 0% 1.326µ ± 0% +0.68% (p=0.000 n=20) RegexpMatchHard_1K 41.35µ ± 0% 40.95µ ± 0% -0.97% (p=0.000 n=20) Revcomp 463.0m ± 0% 473.0m ± 0% +2.15% (p=0.000 n=20) Template 83.80m ± 0% 76.26m ± 1% -9.00% (p=0.000 n=20) TimeParse 283.3n ± 0% 260.8n ± 0% -7.96% (p=0.000 n=20) TimeFormat 307.2n ± 0% 290.5n ± 0% -5.45% (p=0.000 n=20) geomean 53.16µ 51.67µ -2.79% Change-Id: Iaec2f50db18e9a2b405605f8b92af3683114ea34 Reviewed-on: https://go-review.googlesource.com/c/go/+/616035 Reviewed-by: Carlos Amedee Reviewed-by: abner chenc Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- src/cmd/internal/obj/loong64/obj.go | 81 ++++++++++++++++------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/src/cmd/internal/obj/loong64/obj.go b/src/cmd/internal/obj/loong64/obj.go index 681802a18d..0446fb78a0 100644 --- a/src/cmd/internal/obj/loong64/obj.go +++ b/src/cmd/internal/obj/loong64/obj.go @@ -726,7 +726,7 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog { var q *obj.Prog if framesize <= abi.StackSmall { // small stack: SP < stackguard - // AGTU SP, stackguard, R20 + // SGTU SP, stackguard, R20 p = obj.Appendp(p, c.newprog) p.As = ASGTU @@ -784,19 +784,41 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog { p.To.Reg = REG_R20 } - // q1: BNE R20, done + // q1: BEQ R20, morestack p = obj.Appendp(p, c.newprog) q1 := p - p.As = ABNE + p.As = ABEQ p.From.Type = obj.TYPE_REG p.From.Reg = REG_R20 p.To.Type = obj.TYPE_BRANCH p.Mark |= BRANCH - // MOV LINK, R31 - p = obj.Appendp(p, c.newprog) + end := c.ctxt.EndUnsafePoint(p, c.newprog, -1) + + var last *obj.Prog + for last = c.cursym.Func().Text; last.Link != nil; last = last.Link { + } + // Now we are at the end of the function, but logically + // we are still in function prologue. We need to fix the + // SP data and PCDATA. + spfix := obj.Appendp(last, c.newprog) + spfix.As = obj.ANOP + spfix.Spadj = -framesize + + pcdata := c.ctxt.EmitEntryStackMap(c.cursym, spfix, c.newprog) + pcdata = c.ctxt.StartUnsafePoint(pcdata, c.newprog) + + if q != nil { + q.To.SetTarget(pcdata) + } + q1.To.SetTarget(pcdata) + + p = c.cursym.Func().SpillRegisterArgs(pcdata, c.newprog) + + // MOV LINK, R31 + p = obj.Appendp(p, c.newprog) p.As = mov p.From.Type = obj.TYPE_REG p.From.Reg = REGLINK @@ -807,45 +829,32 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog { p.Mark |= LABEL } - p = c.ctxt.EmitEntryStackMap(c.cursym, p, c.newprog) - - // Spill the register args that could be clobbered by the - // morestack code - p = c.cursym.Func().SpillRegisterArgs(p, c.newprog) + // JAL runtime.morestack(SB) + call := obj.Appendp(p, c.newprog) + call.As = AJAL + call.To.Type = obj.TYPE_BRANCH - // JAL runtime.morestack(SB) - p = obj.Appendp(p, c.newprog) - - p.As = AJAL - p.To.Type = obj.TYPE_BRANCH if c.cursym.CFunc() { - p.To.Sym = c.ctxt.Lookup("runtime.morestackc") + call.To.Sym = c.ctxt.Lookup("runtime.morestackc") } else if !c.cursym.Func().Text.From.Sym.NeedCtxt() { - p.To.Sym = c.ctxt.Lookup("runtime.morestack_noctxt") + call.To.Sym = c.ctxt.Lookup("runtime.morestack_noctxt") } else { - p.To.Sym = c.ctxt.Lookup("runtime.morestack") + call.To.Sym = c.ctxt.Lookup("runtime.morestack") } - p.Mark |= BRANCH - - p = c.cursym.Func().UnspillRegisterArgs(p, c.newprog) - p = c.ctxt.EndUnsafePoint(p, c.newprog, -1) + call.Mark |= BRANCH - // JMP start - p = obj.Appendp(p, c.newprog) - - p.As = AJMP - p.To.Type = obj.TYPE_BRANCH - p.To.SetTarget(startPred.Link) - startPred.Link.Mark |= LABEL - p.Mark |= BRANCH - - // placeholder for q1's jump target - p = obj.Appendp(p, c.newprog) + // The instructions which unspill regs should be preemptible. + pcdata = c.ctxt.EndUnsafePoint(call, c.newprog, -1) + unspill := c.cursym.Func().UnspillRegisterArgs(pcdata, c.newprog) - p.As = obj.ANOP // zero-width place holder - q1.To.SetTarget(p) + // JMP start + jmp := obj.Appendp(unspill, c.newprog) + jmp.As = AJMP + jmp.To.Type = obj.TYPE_BRANCH + jmp.To.SetTarget(startPred.Link) + jmp.Spadj = +framesize - return p + return end } func (c *ctxt0) addnop(p *obj.Prog) { -- 2.48.1