Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: use MOVBQZX for OpAMD64LoweredHasCPUFeature
author: Josh Bleecher Snyder <josharian@gmail.com>
Sun, 5 Apr 2020 02:22:28 +0000 (19:22 -0700)
committer: Josh Bleecher Snyder <josharian@gmail.com>
Tue, 7 Apr 2020 18:19:55 +0000 (18:19 +0000)
In the commit message of CL 212360, I wrote:

> This new intrinsic ... generates MOVB+TESTB+NE.
> (It is possible that MOVBQZX+TESTQ+NE would be better.)

I should have tested. MOVBQZX+TESTQ+NE does in fact appear to be better.

For the benchmark in #36196, on my machine:

name      old time/op  new time/op  delta
FMA-8     0.86ns ± 6%  0.70ns ± 5%  -18.79%  (p=0.000 n=98+97)
NonFMA-8  0.61ns ± 5%  0.60ns ± 4%   -0.74%  (p=0.001 n=100+97)

Interestingly, these are both considerably faster than
the measurements I took a couple of months ago (1.4ns/2ns).
It appears that CL 219131 (clearing VZEROUPPER in asyncPreempt) helped a lot.
And FMA is now once again slower than NonFMA, although this change
helps it regain some ground.

Updates #15808
Updates #36351
Updates #36196

Change-Id: I8a326289a963b1939aaa7eaa2fab2ec536467c7d
Reviewed-on: https://go-review.googlesource.com/c/go/+/227238
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/gen/AMD64Ops.go
src/cmd/compile/internal/ssa/rewriteAMD64.go

index 4ce81592f47b98640dfefc2f3ef4d192a193c880..2314db520c8312acc163816d822a9a76acae1447 100644 (file)
@@ -903,7 +903,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.From.Reg = v.Args[0].Reg()
                gc.AddrAuto(&p.To, v)
        case ssa.OpAMD64LoweredHasCPUFeature:
-               p := s.Prog(x86.AMOVB)
+               p := s.Prog(x86.AMOVBQZX)
                p.From.Type = obj.TYPE_MEM
                gc.AddAux(&p.From, v)
                p.To.Type = obj.TYPE_REG
index 7a2c148699b2926d1e4621e0d5085d2b9cc3fc56..f21e1d8bf74f87dbcae429705b80e47b997a125f 100644 (file)
 (GetClosurePtr ...) -> (LoweredGetClosurePtr ...)
 (GetCallerPC ...) -> (LoweredGetCallerPC ...)
 (GetCallerSP ...) -> (LoweredGetCallerSP ...)
-(HasCPUFeature ...) -> (LoweredHasCPUFeature ...)
+
+(HasCPUFeature {s}) -> (SETNE (CMPQconst [0] (LoweredHasCPUFeature {s})))
 (Addr ...) -> (LEAQ ...)
 (LocalAddr {sym} base _) -> (LEAQ {sym} base)
 
index b32f12341803ad246fda71ec85262b5d00dabcd7..be4a0bf805d86114037d2bf8730b0a1b832992c5 100644 (file)
@@ -738,7 +738,7 @@ func init() {
                // It saves all GP registers if necessary, but may clobber others.
                {name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), buildReg("AX CX DX BX BP SI R8 R9")}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},
 
-               {name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "bool", aux: "Sym", symEffect: "None"},
+               {name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "UInt64", aux: "Sym", symEffect: "None"},
 
                // There are three of these functions so that they can have three different register inputs.
                // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
index 1f147c9eb5c87896362b20708b189c09febd57ba..ce802a97c58310908867066f9b221f7b62e204f4 100644 (file)
@@ -787,8 +787,7 @@ func rewriteValueAMD64(v *Value) bool {
        case OpGreater64F:
                return rewriteValueAMD64_OpGreater64F(v)
        case OpHasCPUFeature:
-               v.Op = OpAMD64LoweredHasCPUFeature
-               return true
+               return rewriteValueAMD64_OpHasCPUFeature(v)
        case OpHmul32:
                v.Op = OpAMD64HMULL
                return true
@@ -29924,6 +29923,23 @@ func rewriteValueAMD64_OpGreater64F(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpHasCPUFeature(v *Value) bool {
+       b := v.Block
+       typ := &b.Func.Config.Types
+       // match: (HasCPUFeature {s})
+       // result: (SETNE (CMPQconst [0] (LoweredHasCPUFeature {s})))
+       for {
+               s := v.Aux
+               v.reset(OpAMD64SETNE)
+               v0 := b.NewValue0(v.Pos, OpAMD64CMPQconst, types.TypeFlags)
+               v0.AuxInt = 0
+               v1 := b.NewValue0(v.Pos, OpAMD64LoweredHasCPUFeature, typ.UInt64)
+               v1.Aux = s
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               return true
+       }
+}
 func rewriteValueAMD64_OpIsInBounds(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]