From: Josh Bleecher Snyder
Date: Sun, 5 Apr 2020 02:22:28 +0000 (-0700)
Subject: cmd/compile: use MOVBQZX for OpAMD64LoweredHasCPUFeature
X-Git-Tag: go1.15beta1~645
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=7ee8467b276fbe442df8c84c3d13a99e80519c24;p=gostls13.git

cmd/compile: use MOVBQZX for OpAMD64LoweredHasCPUFeature

In the commit message of CL 212360, I wrote:

> This new intrinsic ... generates MOVB+TESTB+NE.
> (It is possible that MOVBQZX+TESTQ+NE would be better.)

I should have tested. MOVBQZX+TESTQ+NE does in fact appear to be better.
For the benchmark in #36196, on my machine:

name      old time/op  new time/op  delta
FMA-8     0.86ns ± 6%  0.70ns ± 5%  -18.79%  (p=0.000 n=98+97)
NonFMA-8  0.61ns ± 5%  0.60ns ± 4%   -0.74%  (p=0.001 n=100+97)

Interestingly, these are both considerably faster than
the measurements I took a couple of months ago (1.4ns/2ns).
It appears that CL 219131 (clearing VZEROUPPER in asyncPreempt) helped a lot.
And FMA is now once again slower than NonFMA, although this change
helps it regain some ground.
Updates #15808 Updates #36351 Updates #36196 Change-Id: I8a326289a963b1939aaa7eaa2fab2ec536467c7d Reviewed-on: https://go-review.googlesource.com/c/go/+/227238 Run-TryBot: Josh Bleecher Snyder TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 4ce81592f4..2314db520c 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -903,7 +903,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.From.Reg = v.Args[0].Reg() gc.AddrAuto(&p.To, v) case ssa.OpAMD64LoweredHasCPUFeature: - p := s.Prog(x86.AMOVB) + p := s.Prog(x86.AMOVBQZX) p.From.Type = obj.TYPE_MEM gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index 7a2c148699..f21e1d8bf7 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -478,7 +478,8 @@ (GetClosurePtr ...) -> (LoweredGetClosurePtr ...) (GetCallerPC ...) -> (LoweredGetCallerPC ...) (GetCallerSP ...) -> (LoweredGetCallerSP ...) -(HasCPUFeature ...) -> (LoweredHasCPUFeature ...) + +(HasCPUFeature {s}) -> (SETNE (CMPQconst [0] (LoweredHasCPUFeature {s}))) (Addr ...) -> (LEAQ ...) (LocalAddr {sym} base _) -> (LEAQ {sym} base) diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go index b32f123418..be4a0bf805 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go @@ -738,7 +738,7 @@ func init() { // It saves all GP registers if necessary, but may clobber others. 
{name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), buildReg("AX CX DX BX BP SI R8 R9")}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"}, - {name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "bool", aux: "Sym", symEffect: "None"}, + {name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "UInt64", aux: "Sym", symEffect: "None"}, // There are three of these functions so that they can have three different register inputs. // When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 1f147c9eb5..ce802a97c5 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -787,8 +787,7 @@ func rewriteValueAMD64(v *Value) bool { case OpGreater64F: return rewriteValueAMD64_OpGreater64F(v) case OpHasCPUFeature: - v.Op = OpAMD64LoweredHasCPUFeature - return true + return rewriteValueAMD64_OpHasCPUFeature(v) case OpHmul32: v.Op = OpAMD64HMULL return true @@ -29924,6 +29923,23 @@ func rewriteValueAMD64_OpGreater64F(v *Value) bool { return true } } +func rewriteValueAMD64_OpHasCPUFeature(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (HasCPUFeature {s}) + // result: (SETNE (CMPQconst [0] (LoweredHasCPUFeature {s}))) + for { + s := v.Aux + v.reset(OpAMD64SETNE) + v0 := b.NewValue0(v.Pos, OpAMD64CMPQconst, types.TypeFlags) + v0.AuxInt = 0 + v1 := b.NewValue0(v.Pos, OpAMD64LoweredHasCPUFeature, typ.UInt64) + v1.Aux = s + v0.AddArg(v1) + v.AddArg(v0) + return true + } +} func rewriteValueAMD64_OpIsInBounds(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0]