]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue
authorPhilip Hofer <phofer@umich.edu>
Tue, 28 Feb 2017 22:35:37 +0000 (14:35 -0800)
committerJosh Bleecher Snyder <josharian@gmail.com>
Thu, 2 Mar 2017 05:15:32 +0000 (05:15 +0000)
This is a follow-up to CL 36893.

Move the unlikely branch in the wrapper prologue to the end
of the function, where it has minimal impact on the instruction
cache. Static branch prediction is also less likely to choose
a forward branch.

Updates #19042

sort benchmarks:
name                  old time/op  new time/op  delta
SearchWrappers-4      1.44µs ± 0%  1.45µs ± 0%  +1.15%  (p=0.000 n=9+10)
SortString1K-4        1.02ms ± 0%  1.04ms ± 0%  +2.39%  (p=0.000 n=10+10)
SortString1K_Slice-4   960µs ± 0%   989µs ± 0%  +2.95%  (p=0.000 n=9+10)
StableString1K-4       218µs ± 0%   213µs ± 0%  -2.13%  (p=0.000 n=10+10)
SortInt1K-4            541µs ± 0%   543µs ± 0%  +0.30%  (p=0.003 n=9+10)
StableInt1K-4          760µs ± 1%   763µs ± 1%  +0.38%  (p=0.011 n=10+10)
StableInt1K_Slice-4    840µs ± 1%   779µs ± 0%  -7.31%  (p=0.000 n=9+10)
SortInt64K-4          55.2ms ± 0%  55.4ms ± 1%  +0.34%  (p=0.012 n=10+8)
SortInt64K_Slice-4    56.2ms ± 0%  55.6ms ± 1%  -1.16%  (p=0.000 n=10+10)
StableInt64K-4        70.9ms ± 1%  71.0ms ± 0%    ~     (p=0.315 n=10+7)
Sort1e2-4              250µs ± 0%   249µs ± 1%    ~     (p=0.315 n=9+10)
Stable1e2-4            600µs ± 0%   594µs ± 0%  -1.09%  (p=0.000 n=9+10)
Sort1e4-4             51.2ms ± 0%  51.4ms ± 1%  +0.40%  (p=0.001 n=9+10)
Stable1e4-4            204ms ± 1%   199ms ± 1%  -2.27%  (p=0.000 n=10+10)
Sort1e6-4              8.42s ± 0%   8.44s ± 0%  +0.28%  (p=0.000 n=8+9)
Stable1e6-4            43.3s ± 0%   42.5s ± 1%  -1.89%  (p=0.000 n=9+9)

Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e
Reviewed-on: https://go-review.googlesource.com/37611
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
src/cmd/internal/obj/arm/obj5.go

index 75d22bcf810a05dc50e2bfbd03d6f3f74c24d8e5..52a17e9fd890bc039688026d53468478e3a27cae 100644 (file)
@@ -341,8 +341,6 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
                q = p
        }
 
-       var p1 *obj.Prog
-       var p2 *obj.Prog
        var q2 *obj.Prog
        for p := cursym.Text; p != nil; p = p.Link {
                o := p.As
@@ -391,22 +389,24 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
                                // if(g->panic != nil && g->panic->argp == FP) g->panic->argp = bottom-of-frame
                                //
                                //      MOVW g_panic(g), R1
-                               //      CMP $0, R1
-                               //      B.EQ end
+                               //      CMP  $0, R1
+                               //      B.NE checkargp
+                               // end:
+                               //      NOP
+                               // ... function ...
+                               // checkargp:
                                //      MOVW panic_argp(R1), R2
-                               //      ADD $(autosize+4), R13, R3
-                               //      CMP R2, R3
+                               //      ADD  $(autosize+4), R13, R3
+                               //      CMP  R2, R3
                                //      B.NE end
-                               //      ADD $4, R13, R4
+                               //      ADD  $4, R13, R4
                                //      MOVW R4, panic_argp(R1)
-                               // end:
-                               //      NOP
+                               //      B    end
                                //
                                // The NOP is needed to give the jumps somewhere to land.
                                // It is a liblink NOP, not an ARM NOP: it encodes to 0 instruction bytes.
 
                                p = obj.Appendp(ctxt, p)
-
                                p.As = AMOVW
                                p.From.Type = obj.TYPE_MEM
                                p.From.Reg = REGG
@@ -420,20 +420,34 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
                                p.From.Offset = 0
                                p.Reg = REG_R1
 
-                               p = obj.Appendp(ctxt, p)
-                               p.As = ABEQ
-                               p.To.Type = obj.TYPE_BRANCH
-                               p1 = p
+                               // B.NE checkargp
+                               bne := obj.Appendp(ctxt, p)
+                               bne.As = ABNE
+                               bne.To.Type = obj.TYPE_BRANCH
 
-                               p = obj.Appendp(ctxt, p)
-                               p.As = AMOVW
-                               p.From.Type = obj.TYPE_MEM
-                               p.From.Reg = REG_R1
-                               p.From.Offset = 0 // Panic.argp
-                               p.To.Type = obj.TYPE_REG
-                               p.To.Reg = REG_R2
+                               // end: NOP
+                               end := obj.Appendp(ctxt, bne)
+                               end.As = obj.ANOP
 
-                               p = obj.Appendp(ctxt, p)
+                               // find end of function
+                               var last *obj.Prog
+                               for last = end; last.Link != nil; last = last.Link {
+                               }
+
+                               // MOVW panic_argp(R1), R2
+                               mov := obj.Appendp(ctxt, last)
+                               mov.As = AMOVW
+                               mov.From.Type = obj.TYPE_MEM
+                               mov.From.Reg = REG_R1
+                               mov.From.Offset = 0 // Panic.argp
+                               mov.To.Type = obj.TYPE_REG
+                               mov.To.Reg = REG_R2
+
+                               // B.NE branch target is MOVW above
+                               bne.Pcond = mov
+
+                               // ADD $(autosize+4), R13, R3
+                               p = obj.Appendp(ctxt, mov)
                                p.As = AADD
                                p.From.Type = obj.TYPE_CONST
                                p.From.Offset = int64(autosize) + 4
@@ -441,17 +455,20 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
                                p.To.Type = obj.TYPE_REG
                                p.To.Reg = REG_R3
 
+                               // CMP R2, R3
                                p = obj.Appendp(ctxt, p)
                                p.As = ACMP
                                p.From.Type = obj.TYPE_REG
                                p.From.Reg = REG_R2
                                p.Reg = REG_R3
 
+                               // B.NE end
                                p = obj.Appendp(ctxt, p)
                                p.As = ABNE
                                p.To.Type = obj.TYPE_BRANCH
-                               p2 = p
+                               p.Pcond = end
 
+                               // ADD $4, R13, R4
                                p = obj.Appendp(ctxt, p)
                                p.As = AADD
                                p.From.Type = obj.TYPE_CONST
@@ -460,6 +477,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
                                p.To.Type = obj.TYPE_REG
                                p.To.Reg = REG_R4
 
+                               // MOVW R4, panic_argp(R1)
                                p = obj.Appendp(ctxt, p)
                                p.As = AMOVW
                                p.From.Type = obj.TYPE_REG
@@ -468,11 +486,14 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
                                p.To.Reg = REG_R1
                                p.To.Offset = 0 // Panic.argp
 
+                               // B end
                                p = obj.Appendp(ctxt, p)
+                               p.As = AB
+                               p.To.Type = obj.TYPE_BRANCH
+                               p.Pcond = end
 
-                               p.As = obj.ANOP
-                               p1.Pcond = p
-                               p2.Pcond = p
+                               // reset for subsequent passes
+                               p = end
                        }
 
                case obj.ARET:
@@ -702,7 +723,7 @@ func softfloat(ctxt *obj.Link, cursym *obj.LSym) {
 }
 
 func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
-       // MOVW                 g_stackguard(g), R1
+       // MOVW g_stackguard(g), R1
        p = obj.Appendp(ctxt, p)
 
        p.As = AMOVW
@@ -748,11 +769,11 @@ func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
                //      SP-stackguard+StackGuard < framesize + (StackGuard-StackSmall)
                // The +StackGuard on both sides is required to keep the left side positive:
                // SP is allowed to be slightly below stackguard. See stack.h.
-               //      CMP $StackPreempt, R1
+               //      CMP     $StackPreempt, R1
                //      MOVW.NE $StackGuard(SP), R2
-               //      SUB.NE R1, R2
+               //      SUB.NE  R1, R2
                //      MOVW.NE $(framesize+(StackGuard-StackSmall)), R3
-               //      CMP.NE R3, R2
+               //      CMP.NE  R3, R2
                p = obj.Appendp(ctxt, p)
 
                p.As = ACMP