Cypherpunks repositories - gostls13.git/commitdiff
cmd/internal/obj: optimize the function stacksplit on loong64
author Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Tue, 3 Sep 2024 12:11:06 +0000 (20:11 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Tue, 8 Oct 2024 13:59:45 +0000 (13:59 +0000)
In the process of stack split checking, loong64 uses the following
logic: if SP > stackguard then goto done, else morestack

The problem with this is that morestack is executed far less often
than done, while static branch prediction tends to favor the
morestack path, which leads to a certain rate of branch
mispredictions.

Change the logic here to:
if SP <= stackguard then goto morestack, else done

benchmarks on 3A6000:

goos: linux
goarch: loong64
pkg: fmt
cpu: Loongson-3A6000 @ 2500.00MHz
                              │  bench.old  │              bench.new              │
                              │   sec/op    │   sec/op     vs base                │
SprintfPadding                  418.3n ± 1%   387.0n ± 0%   -7.49% (p=0.000 n=20)
SprintfEmpty                    35.95n ± 0%   35.86n ± 0%   -0.25% (p=0.000 n=20)
SprintfString                   75.02n ± 1%   72.24n ± 0%   -3.71% (p=0.000 n=20)
SprintfTruncateString           165.7n ± 3%   139.9n ± 1%  -15.58% (p=0.000 n=20)
SprintfTruncateBytes            171.0n ± 0%   147.3n ± 0%  -13.83% (p=0.000 n=20)
SprintfSlowParsingPath          90.56n ± 0%   80.85n ± 0%  -10.72% (p=0.000 n=20)
SprintfQuoteString              560.2n ± 0%   509.7n ± 0%   -9.01% (p=0.000 n=20)
SprintfInt                      58.62n ± 0%   56.45n ± 0%   -3.70% (p=0.000 n=20)
SprintfIntInt                   141.7n ± 0%   122.2n ± 0%  -13.73% (p=0.000 n=20)
SprintfPrefixedInt              210.6n ± 0%   208.8n ± 0%   -0.88% (p=0.000 n=20)
SprintfFloat                    282.3n ± 0%   251.8n ± 1%  -10.80% (p=0.000 n=20)
SprintfComplex                  854.1n ± 0%   813.8n ± 0%   -4.71% (p=0.000 n=20)
SprintfBoolean                  76.32n ± 0%   71.14n ± 1%   -6.79% (p=0.000 n=20)
SprintfHexString                218.5n ± 0%   193.4n ± 0%  -11.51% (p=0.000 n=20)
SprintfHexBytes                 321.3n ± 0%   275.0n ± 0%  -14.42% (p=0.000 n=20)
SprintfBytes                    573.5n ± 0%   553.2n ± 1%   -3.54% (p=0.000 n=20)
SprintfStringer                 501.1n ± 1%   446.6n ± 0%  -10.86% (p=0.000 n=20)
SprintfStructure                1.793µ ± 0%   1.683µ ± 0%   -6.16% (p=0.000 n=20)
ManyArgs                        500.0n ± 0%   470.4n ± 0%   -5.92% (p=0.000 n=20)
FprintInt                       67.51n ± 0%   65.71n ± 0%   -2.66% (p=0.000 n=20)
FprintfBytes                    130.9n ± 0%   129.5n ± 1%   -1.11% (p=0.000 n=20)
FprintIntNoAlloc                67.55n ± 0%   65.80n ± 0%   -2.58% (p=0.000 n=20)
ScanInts                        386.3µ ± 0%   346.5µ ± 0%  -10.29% (p=0.000 n=20)
ScanRecursiveInt                25.97m ± 0%   25.93m ± 0%   -0.15% (p=0.038 n=20)
ScanRecursiveIntReaderWrapper   26.07m ± 0%   25.93m ± 0%   -0.53% (p=0.001 n=20)
geomean                         702.6n        653.7n        -6.96%

goos: linux
goarch: loong64
pkg: test/bench/go1
cpu: Loongson-3A6000 @ 2500.00MHz
                      │  bench.old   │              bench.new              │
                      │    sec/op    │   sec/op     vs base                │
BinaryTree17              7.688 ± 1%    7.724 ± 0%   +0.47% (p=0.040 n=20)
Fannkuch11                2.670 ± 0%    2.645 ± 0%   -0.94% (p=0.000 n=20)
FmtFprintfEmpty          35.93n ± 0%   37.50n ± 0%   +4.37% (p=0.000 n=20)
FmtFprintfString         56.32n ± 0%   59.74n ± 0%   +6.08% (p=0.000 n=20)
FmtFprintfInt            64.47n ± 0%   61.26n ± 0%   -4.98% (p=0.000 n=20)
FmtFprintfIntInt        100.30n ± 0%   99.67n ± 0%   -0.63% (p=0.000 n=20)
FmtFprintfPrefixedInt    116.7n ± 0%   119.3n ± 0%   +2.23% (p=0.000 n=20)
FmtFprintfFloat          234.1n ± 0%   203.4n ± 0%  -13.11% (p=0.000 n=20)
FmtManyArgs              503.0n ± 0%   467.9n ± 0%   -6.96% (p=0.000 n=20)
GobDecode                8.125m ± 0%   7.299m ± 0%  -10.17% (p=0.000 n=20)
GobEncode                8.930m ± 1%   8.581m ± 1%   -3.91% (p=0.000 n=20)
Gzip                     280.0m ± 0%   279.8m ± 0%   -0.10% (p=0.000 n=20)
Gunzip                   33.30m ± 0%   32.48m ± 0%   -2.49% (p=0.000 n=20)
HTTPClientServer         55.43µ ± 0%   54.10µ ± 1%   -2.41% (p=0.000 n=20)
JSONEncode              10.086m ± 0%   9.055m ± 0%  -10.22% (p=0.000 n=20)
JSONDecode               49.37m ± 1%   46.22m ± 1%   -6.40% (p=0.000 n=20)
Mandelbrot200            4.606m ± 0%   4.606m ± 0%        ~ (p=0.280 n=20)
GoParse                  5.010m ± 0%   4.855m ± 0%   -3.09% (p=0.000 n=20)
RegexpMatchEasy0_32      59.09n ± 0%   59.32n ± 0%   +0.39% (p=0.000 n=20)
RegexpMatchEasy0_1K      455.2n ± 0%   453.8n ± 0%   -0.31% (p=0.000 n=20)
RegexpMatchEasy1_32      59.24n ± 0%   60.11n ± 0%   +1.47% (p=0.000 n=20)
RegexpMatchEasy1_1K      555.2n ± 0%   553.9n ± 0%   -0.23% (p=0.000 n=20)
RegexpMatchMedium_32     845.7n ± 0%   775.6n ± 0%   -8.28% (p=0.000 n=20)
RegexpMatchMedium_1K     26.68µ ± 0%   26.48µ ± 0%   -0.78% (p=0.000 n=20)
RegexpMatchHard_32       1.317µ ± 0%   1.326µ ± 0%   +0.68% (p=0.000 n=20)
RegexpMatchHard_1K       41.35µ ± 0%   40.95µ ± 0%   -0.97% (p=0.000 n=20)
Revcomp                  463.0m ± 0%   473.0m ± 0%   +2.15% (p=0.000 n=20)
Template                 83.80m ± 0%   76.26m ± 1%   -9.00% (p=0.000 n=20)
TimeParse                283.3n ± 0%   260.8n ± 0%   -7.96% (p=0.000 n=20)
TimeFormat               307.2n ± 0%   290.5n ± 0%   -5.45% (p=0.000 n=20)
geomean                  53.16µ        51.67µ        -2.79%

Change-Id: Iaec2f50db18e9a2b405605f8b92af3683114ea34
Reviewed-on: https://go-review.googlesource.com/c/go/+/616035
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/cmd/internal/obj/loong64/obj.go

index 681802a18d6d47ca10f1f3271a42d3c2c2c1d406..0446fb78a09fbbc2abab94c37abfe29fa144d2e7 100644 (file)
@@ -726,7 +726,7 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
        var q *obj.Prog
        if framesize <= abi.StackSmall {
                // small stack: SP < stackguard
-               //      AGTU    SP, stackguard, R20
+               //      SGTU    SP, stackguard, R20
                p = obj.Appendp(p, c.newprog)
 
                p.As = ASGTU
@@ -784,19 +784,41 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
                p.To.Reg = REG_R20
        }
 
-       // q1: BNE      R20, done
+       // q1: BEQ      R20, morestack
        p = obj.Appendp(p, c.newprog)
        q1 := p
 
-       p.As = ABNE
+       p.As = ABEQ
        p.From.Type = obj.TYPE_REG
        p.From.Reg = REG_R20
        p.To.Type = obj.TYPE_BRANCH
        p.Mark |= BRANCH
 
-       // MOV  LINK, R31
-       p = obj.Appendp(p, c.newprog)
+       end := c.ctxt.EndUnsafePoint(p, c.newprog, -1)
+
+       var last *obj.Prog
+       for last = c.cursym.Func().Text; last.Link != nil; last = last.Link {
+       }
 
+       // Now we are at the end of the function, but logically
+       // we are still in function prologue. We need to fix the
+       // SP data and PCDATA.
+       spfix := obj.Appendp(last, c.newprog)
+       spfix.As = obj.ANOP
+       spfix.Spadj = -framesize
+
+       pcdata := c.ctxt.EmitEntryStackMap(c.cursym, spfix, c.newprog)
+       pcdata = c.ctxt.StartUnsafePoint(pcdata, c.newprog)
+
+       if q != nil {
+               q.To.SetTarget(pcdata)
+       }
+       q1.To.SetTarget(pcdata)
+
+       p = c.cursym.Func().SpillRegisterArgs(pcdata, c.newprog)
+
+       // MOV  LINK, R31
+       p = obj.Appendp(p, c.newprog)
        p.As = mov
        p.From.Type = obj.TYPE_REG
        p.From.Reg = REGLINK
@@ -807,45 +829,32 @@ func (c *ctxt0) stacksplit(p *obj.Prog, framesize int32) *obj.Prog {
                p.Mark |= LABEL
        }
 
-       p = c.ctxt.EmitEntryStackMap(c.cursym, p, c.newprog)
-
-       // Spill the register args that could be clobbered by the
-       // morestack code
-       p = c.cursym.Func().SpillRegisterArgs(p, c.newprog)
+       // JAL runtime.morestack(SB)
+       call := obj.Appendp(p, c.newprog)
+       call.As = AJAL
+       call.To.Type = obj.TYPE_BRANCH
 
-       // JAL  runtime.morestack(SB)
-       p = obj.Appendp(p, c.newprog)
-
-       p.As = AJAL
-       p.To.Type = obj.TYPE_BRANCH
        if c.cursym.CFunc() {
-               p.To.Sym = c.ctxt.Lookup("runtime.morestackc")
+               call.To.Sym = c.ctxt.Lookup("runtime.morestackc")
        } else if !c.cursym.Func().Text.From.Sym.NeedCtxt() {
-               p.To.Sym = c.ctxt.Lookup("runtime.morestack_noctxt")
+               call.To.Sym = c.ctxt.Lookup("runtime.morestack_noctxt")
        } else {
-               p.To.Sym = c.ctxt.Lookup("runtime.morestack")
+               call.To.Sym = c.ctxt.Lookup("runtime.morestack")
        }
-       p.Mark |= BRANCH
-
-       p = c.cursym.Func().UnspillRegisterArgs(p, c.newprog)
-       p = c.ctxt.EndUnsafePoint(p, c.newprog, -1)
+       call.Mark |= BRANCH
 
-       // JMP  start
-       p = obj.Appendp(p, c.newprog)
-
-       p.As = AJMP
-       p.To.Type = obj.TYPE_BRANCH
-       p.To.SetTarget(startPred.Link)
-       startPred.Link.Mark |= LABEL
-       p.Mark |= BRANCH
-
-       // placeholder for q1's jump target
-       p = obj.Appendp(p, c.newprog)
+       // The instructions which unspill regs should be preemptible.
+       pcdata = c.ctxt.EndUnsafePoint(call, c.newprog, -1)
+       unspill := c.cursym.Func().UnspillRegisterArgs(pcdata, c.newprog)
 
-       p.As = obj.ANOP // zero-width place holder
-       q1.To.SetTarget(p)
+       // JMP start
+       jmp := obj.Appendp(unspill, c.newprog)
+       jmp.As = AJMP
+       jmp.To.Type = obj.TYPE_BRANCH
+       jmp.To.SetTarget(startPred.Link)
+       jmp.Spadj = +framesize
 
-       return p
+       return end
 }
 
 func (c *ctxt0) addnop(p *obj.Prog) {