From c683ab8128e3d26a9894f315793b32fb80e4b5b3 Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Fri, 2 Aug 2019 02:41:59 +0000 Subject: [PATCH] cmd/compile: optimize ARM's math.Abs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This CL optimizes math.Abs to an inline ABSD instruction on ARM. The benchmark results of src/math/ show big improvements. name old time/op new time/op delta Acos-4 181ns ± 0% 182ns ± 0% +0.30% (p=0.000 n=40+40) Acosh-4 202ns ± 0% 202ns ± 0% ~ (all equal) Asin-4 163ns ± 0% 163ns ± 0% ~ (all equal) Asinh-4 242ns ± 0% 242ns ± 0% ~ (all equal) Atan-4 120ns ± 0% 121ns ± 0% +0.83% (p=0.000 n=40+40) Atanh-4 202ns ± 0% 202ns ± 0% ~ (all equal) Atan2-4 173ns ± 0% 173ns ± 0% ~ (all equal) Cbrt-4 1.06µs ± 0% 1.06µs ± 0% +0.09% (p=0.000 n=39+37) Ceil-4 72.9ns ± 0% 72.8ns ± 0% ~ (p=0.237 n=40+40) Copysign-4 13.2ns ± 0% 13.2ns ± 0% ~ (all equal) Cos-4 193ns ± 0% 183ns ± 0% -5.18% (p=0.000 n=40+40) Cosh-4 254ns ± 0% 239ns ± 0% -5.91% (p=0.000 n=40+40) Erf-4 112ns ± 0% 112ns ± 0% ~ (all equal) Erfc-4 117ns ± 0% 117ns ± 0% ~ (all equal) Erfinv-4 127ns ± 0% 127ns ± 1% ~ (p=0.492 n=40+40) Erfcinv-4 128ns ± 0% 128ns ± 0% ~ (all equal) Exp-4 212ns ± 0% 206ns ± 0% -3.05% (p=0.000 n=40+40) ExpGo-4 216ns ± 0% 209ns ± 0% -3.24% (p=0.000 n=40+40) Expm1-4 142ns ± 0% 142ns ± 0% ~ (all equal) Exp2-4 191ns ± 0% 184ns ± 0% -3.45% (p=0.000 n=40+40) Exp2Go-4 194ns ± 0% 187ns ± 0% -3.61% (p=0.000 n=40+40) Abs-4 14.4ns ± 0% 6.3ns ± 0% -56.39% (p=0.000 n=38+39) Dim-4 12.6ns ± 0% 12.6ns ± 0% ~ (all equal) Floor-4 49.6ns ± 0% 49.6ns ± 0% ~ (all equal) Max-4 27.6ns ± 0% 27.6ns ± 0% ~ (all equal) Min-4 27.0ns ± 0% 27.0ns ± 0% ~ (all equal) Mod-4 349ns ± 0% 305ns ± 1% -12.55% (p=0.000 n=33+40) Frexp-4 54.0ns ± 0% 47.1ns ± 0% -12.78% (p=0.000 n=38+38) Gamma-4 242ns ± 0% 234ns ± 0% -3.16% (p=0.000 n=36+40) Hypot-4 84.8ns ± 0% 67.8ns ± 0% -20.05% (p=0.000 n=31+35) HypotGo-4 88.5ns ± 0% 71.6ns ± 0% -19.12% (p=0.000 n=40+38) Ilogb-4 45.8ns ± 0% 38.9ns ± 0% -15.12% (p=0.000 n=40+32) J0-4 821ns ± 0% 802ns ± 0% -2.33% (p=0.000 n=33+40) J1-4 816ns ± 0% 807ns ± 0% -1.05% (p=0.000 n=40+29) Jn-4 1.67µs ± 0% 1.65µs ± 0% -1.45% (p=0.000 n=40+39) Ldexp-4 61.5ns ± 0% 54.6ns ± 0% -11.27% (p=0.000 n=40+32) Lgamma-4 188ns ± 0% 188ns ± 0% ~ (all equal) Log-4 154ns ± 0% 147ns ± 0% -4.78% (p=0.000 n=40+40) Logb-4 50.9ns ± 0% 42.7ns ± 0% -16.11% (p=0.000 n=34+39) Log1p-4 160ns ± 0% 159ns ± 0% ~ (p=0.828 n=40+40) Log10-4 173ns ± 0% 166ns ± 0% -4.05% (p=0.000 n=40+40) Log2-4 65.3ns ± 0% 58.4ns ± 0% -10.57% (p=0.000 n=37+37) Modf-4 36.4ns ± 0% 36.4ns ± 0% ~ (all equal) Nextafter32-4 36.4ns ± 0% 36.4ns ± 0% ~ (all equal) Nextafter64-4 32.7ns ± 0% 32.6ns ± 0% ~ (p=0.375 n=40+40) PowInt-4 300ns ± 0% 277ns ± 0% -7.78% (p=0.000 n=40+40) PowFrac-4 676ns ± 0% 635ns ± 0% -6.00% (p=0.000 n=40+35) Pow10Pos-4 17.6ns ± 0% 17.6ns ± 0% ~ (all equal) Pow10Neg-4 22.0ns ± 0% 22.0ns ± 0% ~ (all equal) Round-4 30.1ns ± 0% 30.1ns ± 0% ~ (all equal) RoundToEven-4 38.9ns ± 0% 38.9ns ± 0% ~ (all equal) Remainder-4 291ns ± 0% 263ns ± 0% -9.62% (p=0.000 n=40+40) Signbit-4 11.3ns ± 0% 11.3ns ± 0% ~ (all equal) Sin-4 185ns ± 0% 185ns ± 0% ~ (all equal) Sincos-4 230ns ± 0% 230ns ± 0% ~ (all equal) Sinh-4 253ns ± 0% 246ns ± 0% -2.77% (p=0.000 n=39+39) SqrtIndirect-4 41.4ns ± 0% 41.4ns ± 0% ~ (all equal) SqrtLatency-4 13.8ns ± 0% 13.8ns ± 0% ~ (all equal) SqrtIndirectLatency-4 37.0ns ± 0% 37.0ns ± 0% ~ (p=0.632 n=40+40) SqrtGoLatency-4 911ns ± 0% 911ns ± 0% +0.08% (p=0.000 n=40+40) SqrtPrime-4 13.2µs ± 0% 13.2µs ± 0% +0.01% (p=0.038 n=38+40) Tan-4 205ns ± 0% 205ns ± 0% ~ (all equal) Tanh-4 264ns ± 0% 247ns ± 0% -6.44% (p=0.000 n=39+32) Trunc-4 45.2ns ± 0% 45.2ns ± 0% ~ (all equal) Y0-4 796ns ± 0% 792ns ± 0% -0.55% (p=0.000 n=35+40) Y1-4 804ns ± 0% 797ns ± 0% -0.82% (p=0.000 n=24+40) Yn-4 1.64µs ± 0% 1.62µs ± 0% -1.27% (p=0.000 n=40+39) Float64bits-4 8.16ns ± 0% 8.16ns ± 0% +0.04% (p=0.000 n=35+40) Float64frombits-4 10.7ns ± 0% 10.7ns ± 0% ~ (all equal) Float32bits-4 7.53ns ± 0% 7.53ns ± 0% ~ (p=0.760 n=40+40) Float32frombits-4 6.91ns ± 0% 6.91ns ± 0% -0.04% (p=0.002 n=32+38) [Geo mean] 111ns 106ns -3.98% Change-Id: I54f4fd7f5160db020b430b556bde59cc0fdb996d Reviewed-on: https://go-review.googlesource.com/c/go/+/188678 Run-TryBot: Ben Shi TryBot-Result: Gobot Gobot Reviewed-by: Cherry Zhang --- src/cmd/compile/internal/arm/ssa.go | 1 + src/cmd/compile/internal/gc/ssa.go | 2 +- src/cmd/compile/internal/ssa/gen/ARM.rules | 1 + src/cmd/compile/internal/ssa/gen/ARMOps.go | 1 + src/cmd/compile/internal/ssa/opGen.go | 14 ++++++++++++++ src/cmd/compile/internal/ssa/rewriteARM.go | 13 +++++++++++++ test/codegen/math.go | 1 + 7 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/cmd/compile/internal/arm/ssa.go b/src/cmd/compile/internal/arm/ssa.go index 16752977a8..0b798a52b9 100644 --- a/src/cmd/compile/internal/arm/ssa.go +++ b/src/cmd/compile/internal/arm/ssa.go @@ -655,6 +655,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ssa.OpARMSQRTD, ssa.OpARMNEGF, ssa.OpARMNEGD, + ssa.OpARMABSD, ssa.OpARMMOVWF, ssa.OpARMMOVWD, ssa.OpARMMOVFW, diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index 9871e11a09..e1c464b843 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -3297,7 +3297,7 @@ func init() { func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpAbs, types.Types[TFLOAT64], args[0]) }, - sys.ARM64, sys.PPC64, sys.Wasm) + sys.ARM64, sys.ARM, sys.PPC64, sys.Wasm) addF("math", "Copysign", func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpCopysign, types.Types[TFLOAT64], args[0], args[1]) diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules index 798611446d..87a91b1261 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM.rules @@ -56,6 +56,7 @@ (Com(32|16|8) x) -> (MVN x) (Sqrt x) -> (SQRTD x) +(Abs x) -> (ABSD x) // TODO: optimize this for ARMv5 and ARMv6 (Ctz32NonZero x) -> (Ctz32 x) diff --git a/src/cmd/compile/internal/ssa/gen/ARMOps.go b/src/cmd/compile/internal/ssa/gen/ARMOps.go index d8bdfeb86e..484f6cfe71 100644 --- a/src/cmd/compile/internal/ssa/gen/ARMOps.go +++ b/src/cmd/compile/internal/ssa/gen/ARMOps.go @@ -211,6 +211,7 @@ func init() { {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32 {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64 {name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64 + {name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"}, // abs(arg0), float64 {name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero {name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // reverse byte order diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 00e49c97b7..f316ea16e6 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -926,6 +926,7 @@ const ( OpARMNEGF OpARMNEGD OpARMSQRTD + OpARMABSD OpARMCLZ OpARMREV OpARMREV16 @@ -12298,6 +12299,19 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "ABSD", + argLen: 1, + asm: arm.AABSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, { name: "CLZ", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go index 6a3237497c..8b569781c5 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM.go +++ b/src/cmd/compile/internal/ssa/rewriteARM.go @@ -420,6 +420,8 @@ func rewriteValueARM(v *Value) bool { return rewriteValueARM_OpARMXORshiftRLreg_0(v) case OpARMXORshiftRR: return rewriteValueARM_OpARMXORshiftRR_0(v) + case OpAbs: + return rewriteValueARM_OpAbs_0(v) case OpAdd16: return rewriteValueARM_OpAdd16_0(v) case OpAdd32: @@ -17179,6 +17181,17 @@ func rewriteValueARM_OpARMXORshiftRR_0(v *Value) bool { } return false } +func rewriteValueARM_OpAbs_0(v *Value) bool { + // match: (Abs x) + // cond: + // result: (ABSD x) + for { + x := v.Args[0] + v.reset(OpARMABSD) + v.AddArg(x) + return true + } +} func rewriteValueARM_OpAdd16_0(v *Value) bool { // match: (Add16 x y) // cond: diff --git a/test/codegen/math.go b/test/codegen/math.go index 597271ce72..36252710d1 100644 --- a/test/codegen/math.go +++ b/test/codegen/math.go @@ -63,6 +63,7 @@ func abs(x, y float64) { // ppc64:"FABS\t" // ppc64le:"FABS\t" // wasm:"F64Abs" + // arm/6:"ABSD\t" sink64[0] = math.Abs(x) // amd64:"BTRQ\t[$]63","PXOR" (TODO: this should be BTSQ) -- 2.50.0