From d7c7d88b2c991c744f3cc1eb5865218f52ef98b0 Mon Sep 17 00:00:00 2001 From: Balaram Makam Date: Mon, 2 Apr 2018 16:22:08 -0400 Subject: [PATCH] cmd/compile: intrinsify math/big.mulWW on ARM64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Performance numbers on amberwing: pkg: math/big name old time/op new time/op delta QuoRem 3.08µs ± 0% 2.93µs ± 1% -4.89% (p=0.008 n=5+5) ModSqrt225_Tonelli 721µs ± 0% 718µs ± 0% -0.46% (p=0.008 n=5+5) ModSqrt224_3Mod4 218µs ± 0% 217µs ± 0% -0.27% (p=0.008 n=5+5) ModSqrt5430_Tonelli 2.91s ± 0% 2.91s ± 0% ~ (p=0.222 n=5+5) ModSqrt5430_3Mod4 970ms ± 0% 970ms ± 0% ~ (p=0.151 n=5+5) Sqrt 45.9µs ± 0% 43.8µs ± 0% -4.63% (p=0.008 n=5+5) IntSqr/1 19.9ns ± 0% 17.3ns ± 0% -13.07% (p=0.008 n=5+5) IntSqr/2 52.6ns ± 0% 50.8ns ± 0% -3.35% (p=0.008 n=5+5) IntSqr/3 70.4ns ± 0% 69.4ns ± 0% ~ (p=0.079 n=4+5) IntSqr/5 103ns ± 0% 99ns ± 0% -3.98% (p=0.008 n=5+5) IntSqr/8 179ns ± 0% 178ns ± 0% -0.56% (p=0.008 n=5+5) IntSqr/10 272ns ± 0% 272ns ± 0% ~ (all equal) IntSqr/20 763ns ± 0% 787ns ± 0% +3.15% (p=0.016 n=5+4) IntSqr/30 1.25µs ± 1% 1.29µs ± 1% +3.27% (p=0.008 n=5+5) IntSqr/50 2.64µs ± 0% 2.71µs ± 0% +2.61% (p=0.008 n=5+5) IntSqr/80 5.67µs ± 0% 5.72µs ± 0% +0.88% (p=0.008 n=5+5) IntSqr/100 8.05µs ± 0% 8.09µs ± 0% +0.45% (p=0.008 n=5+5) IntSqr/200 28.0µs ± 0% 28.1µs ± 0% ~ (p=0.151 n=5+5) IntSqr/300 59.4µs ± 0% 59.6µs ± 0% +0.36% (p=0.008 n=5+5) IntSqr/500 141µs ± 0% 141µs ± 0% +0.08% (p=0.008 n=5+5) IntSqr/800 280µs ± 0% 280µs ± 0% -0.12% (p=0.008 n=5+5) IntSqr/1000 429µs ± 0% 428µs ± 0% -0.27% (p=0.008 n=5+5) pkg: crypto-ecdsa name old time/op new time/op delta SignP384 7.85ms ± 1% 7.61ms ± 1% -3.12% (p=0.008 n=5+5) Change-Id: I1ab30856cc0e570f6312f0bd8914779b55adbc16 Reviewed-on: https://go-review.googlesource.com/104135 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot --- src/cmd/compile/internal/arm64/ssa.go | 15 +++++++++++++++ src/cmd/compile/internal/gc/ssa.go | 2 +- src/cmd/compile/internal/ssa/gen/ARM64.rules | 3 ++- src/cmd/compile/internal/ssa/gen/ARM64Ops.go | 2 ++ src/cmd/compile/internal/ssa/opGen.go | 16 ++++++++++++++++ src/cmd/compile/internal/ssa/rewriteARM64.go | 16 ++++++++++++++++ 6 files changed, 52 insertions(+), 2 deletions(-) diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go index 4984f9a007..b72ead7368 100644 --- a/src/cmd/compile/internal/arm64/ssa.go +++ b/src/cmd/compile/internal/arm64/ssa.go @@ -413,6 +413,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.Reg = v.Args[0].Reg() p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg() + case ssa.OpARM64LoweredMuluhilo: + r0 := v.Args[0].Reg() + r1 := v.Args[1].Reg() + p := s.Prog(arm64.AUMULH) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.Reg = r0 + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg0() + p1 := s.Prog(arm64.AMUL) + p1.From.Type = obj.TYPE_REG + p1.From.Reg = r1 + p1.Reg = r0 + p1.To.Type = obj.TYPE_REG + p1.To.Reg = v.Reg1() case ssa.OpARM64LoweredAtomicExchange64, ssa.OpARM64LoweredAtomicExchange32: // LDAXR (Rarg0), Rout diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index 7a2de3c8fb..02e8a62467 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -3207,7 +3207,7 @@ func init() { func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1]) }, - sys.ArchAMD64) + sys.ArchAMD64, sys.ArchARM64) add("math/big", "divWW", func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2]) diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index 5eaf76cc8c..edeadfd1d2 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -29,6 +29,7 @@ (Hmul64u x y) -> (UMULH x y) (Hmul32 x y) -> (SRAconst (MULL x y) [32]) (Hmul32u x y) -> (SRAconst (UMULL x y) [32]) +(Mul64uhilo x y) -> (LoweredMuluhilo x y) (Div64 x y) -> (DIV x y) (Div64u x y) -> (UDIV x y) @@ -1791,4 +1792,4 @@ (FSUBS a (FNMULS x y)) -> (FMADDS a x y) (FSUBD a (FNMULD x y)) -> (FMADDD a x y) (FSUBS (FNMULS x y) a) -> (FNMADDS a x y) -(FSUBD (FNMULD x y) a) -> (FNMADDD a x y) \ No newline at end of file +(FSUBD (FNMULD x y) a) -> (FNMADDD a x y) diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go index b311359721..ec75ca38c6 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go @@ -142,6 +142,7 @@ func init() { gp21nog = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}} gp2flags = regInfo{inputs: []regMask{gpg, gpg}} gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}} + gp22 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}} gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}} gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}} gpstore0 = regInfo{inputs: []regMask{gpspsbg}} @@ -203,6 +204,7 @@ func init() { {name: "EON", argLength: 2, reg: gp21, asm: "EON"}, // arg0 ^ ^arg1 {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0 | ^arg1 + {name: "LoweredMuluhilo", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, returns (hi, lo) // unary ops {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0 {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0 diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 32c595382a..ef3875ec02 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1050,6 +1050,7 @@ const ( OpARM64BIC OpARM64EON OpARM64ORN + OpARM64LoweredMuluhilo OpARM64MVN OpARM64NEG OpARM64FNEGS @@ -13578,6 +13579,21 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "LoweredMuluhilo", + argLen: 2, + resultNotInArgs: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 + {1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 + }, + outputs: []outputInfo{ + {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30 + {1, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30 + }, + }, + }, { name: "MVN", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index 9508b46072..407719e744 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -591,6 +591,8 @@ func rewriteValueARM64(v *Value) bool { return rewriteValueARM64_OpMul64_0(v) case OpMul64F: return rewriteValueARM64_OpMul64F_0(v) + case OpMul64uhilo: + return rewriteValueARM64_OpMul64uhilo_0(v) case OpMul8: return rewriteValueARM64_OpMul8_0(v) case OpNeg16: @@ -18906,6 +18908,20 @@ func rewriteValueARM64_OpMul64F_0(v *Value) bool { return true } } +func rewriteValueARM64_OpMul64uhilo_0(v *Value) bool { + // match: (Mul64uhilo x y) + // cond: + // result: (LoweredMuluhilo x y) + for { + _ = v.Args[1] + x := v.Args[0] + y := v.Args[1] + v.reset(OpARM64LoweredMuluhilo) + v.AddArg(x) + v.AddArg(y) + return true + } +} func rewriteValueARM64_OpMul8_0(v *Value) bool { // match: (Mul8 x y) // cond: -- 2.50.0