From 64607dbd26612850d64c4422f14001387956d022 Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Fri, 25 Aug 2017 12:07:01 +0000 Subject: [PATCH] cmd/compile: optimize ARM with MULS MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit MULS was introduced in ARMv7 and corresponds to MULA: it computes arg2 - arg0 * arg1, where MULA computes arg0 * arg1 + arg2. This patch duplicates all MULA-related SSA rules for MULS. Below are the benchmark results compared against the original Go compiler. There is no overall improvement, but a large improvement in special cases. 1. A specific test case is 18.62% faster. (https://github.com/benshi001/ugo1/blob/master/mulsub_test.go) name old time/op new time/op delta MulSub-4 270µs ± 0% 219µs ± 0% -18.62% (p=0.000 n=35+40) 2. Total size of all .a files in pkg/ shrank by 0.002%. 3. The compilecmp benchmark showed no decline. name old time/op new time/op delta Template 2.37s ± 3% 2.36s ± 1% ~ (p=0.233 n=19+18) Unicode 1.32s ± 2% 1.34s ± 5% +1.32% (p=0.011 n=20+18) GoTypes 7.88s ± 1% 7.87s ± 1% ~ (p=0.758 n=20+20) Compiler 37.5s ± 1% 37.6s ± 1% ~ (p=0.194 n=20+19) SSA 83.7s ± 2% 83.5s ± 2% ~ (p=0.569 n=20+19) Flate 1.46s ± 3% 1.45s ± 1% ~ (p=0.619 n=20+17) GoParser 1.87s ± 2% 1.85s ± 1% -0.58% (p=0.048 n=20+18) Reflect 5.10s ± 2% 5.11s ± 2% ~ (p=0.365 n=19+20) Tar 1.78s ± 2% 1.78s ± 2% ~ (p=0.531 n=19+20) XML 2.62s ± 1% 2.61s ± 2% ~ (p=0.057 n=17+19) [Geo mean] 4.68s 4.67s -0.07% name old user-time/op new user-time/op delta Template 2.80s ± 1% 2.79s ± 2% ~ (p=0.686 n=17+20) Unicode 1.61s ± 4% 1.63s ± 6% ~ (p=0.222 n=20+20) GoTypes 9.59s ± 1% 9.60s ± 1% ~ (p=0.482 n=17+20) Compiler 46.1s ± 1% 46.2s ± 1% ~ (p=0.373 n=20+18) SSA 108s ± 1% 108s ± 2% ~ (p=0.784 n=20+20) Flate 1.68s ± 3% 1.69s ± 3% ~ (p=0.335 n=20+19) GoParser 2.20s ± 4% 2.19s ± 2% ~ (p=0.844 n=20+18) Reflect 5.97s ± 3% 6.01s ± 2% ~ (p=0.184 n=20+20) Tar 2.11s ± 2% 2.11s ± 4% ~ (p=0.961 n=19+20) XML 3.07s ± 1% 3.07s ± 3% ~ (p=0.786 n=16+19) [Geo mean] 5.61s 5.62s +0.19% name old text-bytes new text-bytes delta 
HelloSize 586kB ± 0% 586kB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 5.46kB ± 0% 5.46kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 72.9kB ± 0% 72.9kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.03MB ± 0% 1.03MB ± 0% ~ (all equal) 4. The go1 benchmark showed no decline in total. name old time/op new time/op delta BinaryTree17-4 41.7s ± 1% 41.7s ± 1% ~ (p=0.966 n=40+40) Fannkuch11-4 23.6s ± 0% 23.6s ± 1% -0.23% (p=0.000 n=40+40) FmtFprintfEmpty-4 844ns ± 1% 834ns ± 1% -1.23% (p=0.000 n=40+40) FmtFprintfString-4 1.39µs ± 1% 1.40µs ± 1% +0.71% (p=0.000 n=40+40) FmtFprintfInt-4 1.44µs ± 1% 1.45µs ± 1% +0.70% (p=0.000 n=40+40) FmtFprintfIntInt-4 2.10µs ± 1% 2.10µs ± 1% +0.30% (p=0.000 n=40+40) FmtFprintfPrefixedInt-4 2.49µs ± 0% 2.50µs ± 1% +0.66% (p=0.000 n=32+40) FmtFprintfFloat-4 4.42µs ± 1% 4.46µs ± 2% +0.94% (p=0.000 n=40+40) FmtManyArgs-4 8.31µs ± 1% 8.22µs ± 1% -1.09% (p=0.000 n=40+40) GobDecode-4 105ms ± 1% 102ms ± 1% -2.30% (p=0.000 n=39+39) GobEncode-4 90.2ms ± 1% 88.7ms ± 1% -1.66% (p=0.000 n=40+39) Gzip-4 4.17s ± 1% 4.16s ± 1% ~ (p=0.785 n=40+40) Gunzip-4 608ms ± 1% 608ms ± 1% ~ (p=0.481 n=40+40) HTTPClientServer-4 697µs ± 2% 684µs ± 3% -1.89% (p=0.000 n=37+40) JSONEncode-4 255ms ± 1% 256ms ± 1% +0.35% (p=0.000 n=40+40) JSONDecode-4 920ms ± 1% 926ms ± 1% +0.64% (p=0.000 n=40+39) Mandelbrot200-4 49.3ms ± 1% 49.3ms ± 0% +0.07% (p=0.005 n=40+40) GoParse-4 46.8ms ± 2% 46.7ms ± 1% ~ (p=1.000 n=40+40) RegexpMatchEasy0_32-4 1.27µs ± 0% 1.27µs ± 1% ~ (p=0.057 n=40+40) RegexpMatchEasy0_1K-4 7.97µs ± 7% 7.92µs ± 5% ~ (p=0.094 n=40+40) RegexpMatchEasy1_32-4 1.28µs ± 1% 1.28µs ± 1% ~ (p=0.406 n=40+40) RegexpMatchEasy1_1K-4 10.5µs ± 4% 10.5µs ± 3% ~ (p=0.855 n=40+40) RegexpMatchMedium_32-4 2.04µs ± 0% 2.04µs ± 1% -0.22% (p=0.000 n=39+40) RegexpMatchMedium_1K-4 541µs ± 0% 540µs ± 1% -0.25% (p=0.000 n=40+38) RegexpMatchHard_32-4 29.3µs ± 1% 29.3µs ± 0% ~ (p=0.149 n=40+40) RegexpMatchHard_1K-4 
878µs ± 1% 880µs ± 0% +0.14% (p=0.005 n=36+35) Revcomp-4 81.8ms ± 2% 81.4ms ± 2% -0.43% (p=0.015 n=38+39) Template-4 1.05s ± 1% 1.05s ± 1% ~ (p=0.302 n=40+35) TimeParse-4 7.18µs ± 1% 7.26µs ± 1% +1.05% (p=0.000 n=40+36) TimeFormat-4 13.1µs ± 1% 13.1µs ± 1% ~ (p=0.698 n=37+40) [Geo mean] 733µs 732µs -0.16% name old speed new speed delta GobDecode-4 7.34MB/s ± 1% 7.51MB/s ± 1% +2.36% (p=0.000 n=39+39) GobEncode-4 8.51MB/s ± 1% 8.65MB/s ± 1% +1.69% (p=0.000 n=40+39) Gzip-4 4.66MB/s ± 1% 4.66MB/s ± 1% ~ (p=0.783 n=40+40) Gunzip-4 31.9MB/s ± 1% 31.9MB/s ± 1% ~ (p=0.466 n=40+40) JSONEncode-4 7.61MB/s ± 1% 7.58MB/s ± 1% -0.35% (p=0.001 n=40+40) JSONDecode-4 2.11MB/s ± 1% 2.10MB/s ± 1% -0.52% (p=0.000 n=38+39) GoParse-4 1.24MB/s ± 2% 1.24MB/s ± 1% ~ (p=0.556 n=40+39) RegexpMatchEasy0_32-4 25.1MB/s ± 0% 25.1MB/s ± 1% ~ (p=0.064 n=40+40) RegexpMatchEasy0_1K-4 129MB/s ± 8% 129MB/s ± 5% ~ (p=0.094 n=40+40) RegexpMatchEasy1_32-4 25.0MB/s ± 1% 25.1MB/s ± 1% ~ (p=0.331 n=40+40) RegexpMatchEasy1_1K-4 97.7MB/s ± 4% 97.8MB/s ± 3% ~ (p=0.851 n=40+40) RegexpMatchMedium_32-4 490kB/s ± 0% 490kB/s ± 0% ~ (all equal) RegexpMatchMedium_1K-4 1.89MB/s ± 0% 1.90MB/s ± 1% +0.12% (p=0.031 n=40+40) RegexpMatchHard_32-4 1.09MB/s ± 1% 1.09MB/s ± 1% ~ (p=0.597 n=40+40) RegexpMatchHard_1K-4 1.16MB/s ± 1% 1.16MB/s ± 1% ~ (p=0.565 n=40+35) Revcomp-4 31.1MB/s ± 2% 31.2MB/s ± 2% +0.44% (p=0.018 n=38+39) Template-4 1.85MB/s ± 1% 1.85MB/s ± 1% ~ (p=0.873 n=40+40) [Geo mean] 6.66MB/s 6.67MB/s +0.26% Change-Id: Icc972d8a78ea06c32c3aa15733ff0537c82c2dc7 Reviewed-on: https://go-review.googlesource.com/58950 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang --- src/cmd/compile/internal/arm/ssa.go | 2 +- src/cmd/compile/internal/ssa/gen/ARM.rules | 24 + src/cmd/compile/internal/ssa/gen/ARMOps.go | 1 + src/cmd/compile/internal/ssa/opGen.go | 16 + src/cmd/compile/internal/ssa/rewriteARM.go | 528 +++++++++++++++++++++ 5 files changed, 570 insertions(+), 1 deletion(-) diff --git 
a/src/cmd/compile/internal/arm/ssa.go b/src/cmd/compile/internal/arm/ssa.go index d0d864d25d..7951119127 100644 --- a/src/cmd/compile/internal/arm/ssa.go +++ b/src/cmd/compile/internal/arm/ssa.go @@ -402,7 +402,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_REGREG p.To.Reg = v.Reg0() // high 32-bit p.To.Offset = int64(v.Reg1()) // low 32-bit - case ssa.OpARMMULA: + case ssa.OpARMMULA, ssa.OpARMMULS: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = v.Args[0].Reg() diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules index b80747ad31..9062453e47 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM.rules @@ -602,6 +602,28 @@ (MULA (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) -> (ADD (SLLconst [log2(c/7)] (RSBshiftLL x x [3])) a) (MULA (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) -> (ADD (SLLconst [log2(c/9)] (ADDshiftLL x x [3])) a) +(MULS x (MOVWconst [c]) a) && int32(c) == -1 -> (ADD a x) +(MULS _ (MOVWconst [0]) a) -> a +(MULS x (MOVWconst [1]) a) -> (RSB x a) +(MULS x (MOVWconst [c]) a) && isPowerOfTwo(c) -> (RSB (SLLconst [log2(c)] x) a) +(MULS x (MOVWconst [c]) a) && isPowerOfTwo(c-1) && int32(c) >= 3 -> (RSB (ADDshiftLL x x [log2(c-1)]) a) +(MULS x (MOVWconst [c]) a) && isPowerOfTwo(c+1) && int32(c) >= 7 -> (RSB (RSBshiftLL x x [log2(c+1)]) a) +(MULS x (MOVWconst [c]) a) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) -> (RSB (SLLconst [log2(c/3)] (ADDshiftLL x x [1])) a) +(MULS x (MOVWconst [c]) a) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) -> (RSB (SLLconst [log2(c/5)] (ADDshiftLL x x [2])) a) +(MULS x (MOVWconst [c]) a) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) -> (RSB (SLLconst [log2(c/7)] (RSBshiftLL x x [3])) a) +(MULS x (MOVWconst [c]) a) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) -> (RSB (SLLconst [log2(c/9)] (ADDshiftLL x x [3])) a) + +(MULS (MOVWconst [c]) 
x a) && int32(c) == -1 -> (ADD a x) +(MULS (MOVWconst [0]) _ a) -> a +(MULS (MOVWconst [1]) x a) -> (RSB x a) +(MULS (MOVWconst [c]) x a) && isPowerOfTwo(c) -> (RSB (SLLconst [log2(c)] x) a) +(MULS (MOVWconst [c]) x a) && isPowerOfTwo(c-1) && int32(c) >= 3 -> (RSB (ADDshiftLL x x [log2(c-1)]) a) +(MULS (MOVWconst [c]) x a) && isPowerOfTwo(c+1) && int32(c) >= 7 -> (RSB (RSBshiftLL x x [log2(c+1)]) a) +(MULS (MOVWconst [c]) x a) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) -> (RSB (SLLconst [log2(c/3)] (ADDshiftLL x x [1])) a) +(MULS (MOVWconst [c]) x a) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) -> (RSB (SLLconst [log2(c/5)] (ADDshiftLL x x [2])) a) +(MULS (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) -> (RSB (SLLconst [log2(c/7)] (RSBshiftLL x x [3])) a) +(MULS (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) -> (RSB (SLLconst [log2(c/9)] (ADDshiftLL x x [3])) a) + // div by constant (Select0 (CALLudiv x (MOVWconst [1]))) -> x (Select1 (CALLudiv _ (MOVWconst [1]))) -> (MOVWconst [0]) @@ -1215,6 +1237,8 @@ (BIC x x) -> (MOVWconst [0]) (ADD (MUL x y) a) -> (MULA x y a) +(SUB a (MUL x y)) && objabi.GOARM == 7 -> (MULS x y a) +(RSB (MUL x y) a) && objabi.GOARM == 7 -> (MULS x y a) (AND x (MVN y)) -> (BIC x y) diff --git a/src/cmd/compile/internal/ssa/gen/ARMOps.go b/src/cmd/compile/internal/ssa/gen/ARMOps.go index 8cbb73f291..2a041d1340 100644 --- a/src/cmd/compile/internal/ssa/gen/ARMOps.go +++ b/src/cmd/compile/internal/ssa/gen/ARMOps.go @@ -168,6 +168,7 @@ func init() { {name: "MULLU", argLength: 2, reg: gp22, asm: "MULLU", commutative: true}, // arg0 * arg1, high 32 bits in out0, low 32 bits in out1 {name: "MULA", argLength: 3, reg: gp31, asm: "MULA"}, // arg0 * arg1 + arg2 + {name: "MULS", argLength: 3, reg: gp31, asm: "MULS"}, // arg2 - arg0 * arg1 {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1 {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + 
arg1 diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 8d249dd19c..eec57b8b03 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -694,6 +694,7 @@ const ( OpARMRSCconst OpARMMULLU OpARMMULA + OpARMMULS OpARMADDF OpARMADDD OpARMSUBF @@ -8456,6 +8457,21 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "MULS", + argLen: 3, + asm: arm.AMULS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14 + {1, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14 + {2, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14 + }, + outputs: []outputInfo{ + {0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14 + }, + }, + }, { name: "ADDF", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go index 095f8c1356..64aec25bda 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM.go +++ b/src/cmd/compile/internal/ssa/rewriteARM.go @@ -201,6 +201,8 @@ func rewriteValueARM(v *Value) bool { return rewriteValueARM_OpARMMUL_0(v) || rewriteValueARM_OpARMMUL_10(v) || rewriteValueARM_OpARMMUL_20(v) case OpARMMULA: return rewriteValueARM_OpARMMULA_0(v) || rewriteValueARM_OpARMMULA_10(v) || rewriteValueARM_OpARMMULA_20(v) + case OpARMMULS: + return rewriteValueARM_OpARMMULS_0(v) || rewriteValueARM_OpARMMULS_10(v) case OpARMMVN: return rewriteValueARM_OpARMMVN_0(v) case OpARMMVNshiftLL: @@ -9571,6 +9573,488 @@ func rewriteValueARM_OpARMMULA_20(v *Value) bool { } return false } +func rewriteValueARM_OpARMMULS_0(v *Value) bool { + b := v.Block + _ = b + // match: (MULS x (MOVWconst [c]) a) + // cond: int32(c) == -1 + // result: (ADD a x) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + c := v_1.AuxInt + a := v.Args[2] + if !(int32(c) == -1) { + break + } + v.reset(OpARMADD) + v.AddArg(a) + v.AddArg(x) + return true + } + // match: (MULS _ (MOVWconst [0]) a) + // 
cond: + // result: a + for { + _ = v.Args[2] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + if v_1.AuxInt != 0 { + break + } + a := v.Args[2] + v.reset(OpCopy) + v.Type = a.Type + v.AddArg(a) + return true + } + // match: (MULS x (MOVWconst [1]) a) + // cond: + // result: (RSB x a) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + if v_1.AuxInt != 1 { + break + } + a := v.Args[2] + v.reset(OpARMRSB) + v.AddArg(x) + v.AddArg(a) + return true + } + // match: (MULS x (MOVWconst [c]) a) + // cond: isPowerOfTwo(c) + // result: (RSB (SLLconst [log2(c)] x) a) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + c := v_1.AuxInt + a := v.Args[2] + if !(isPowerOfTwo(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS x (MOVWconst [c]) a) + // cond: isPowerOfTwo(c-1) && int32(c) >= 3 + // result: (RSB (ADDshiftLL x x [log2(c-1)]) a) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + c := v_1.AuxInt + a := v.Args[2] + if !(isPowerOfTwo(c-1) && int32(c) >= 3) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type) + v0.AuxInt = log2(c - 1) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS x (MOVWconst [c]) a) + // cond: isPowerOfTwo(c+1) && int32(c) >= 7 + // result: (RSB (RSBshiftLL x x [log2(c+1)]) a) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + c := v_1.AuxInt + a := v.Args[2] + if !(isPowerOfTwo(c+1) && int32(c) >= 7) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMRSBshiftLL, x.Type) + v0.AuxInt = log2(c + 1) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS x 
(MOVWconst [c]) a) + // cond: c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) + // result: (RSB (SLLconst [log2(c/3)] (ADDshiftLL x x [1])) a) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + c := v_1.AuxInt + a := v.Args[2] + if !(c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c / 3) + v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type) + v1.AuxInt = 1 + v1.AddArg(x) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS x (MOVWconst [c]) a) + // cond: c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) + // result: (RSB (SLLconst [log2(c/5)] (ADDshiftLL x x [2])) a) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + c := v_1.AuxInt + a := v.Args[2] + if !(c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c / 5) + v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type) + v1.AuxInt = 2 + v1.AddArg(x) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS x (MOVWconst [c]) a) + // cond: c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) + // result: (RSB (SLLconst [log2(c/7)] (RSBshiftLL x x [3])) a) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + c := v_1.AuxInt + a := v.Args[2] + if !(c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c / 7) + v1 := b.NewValue0(v.Pos, OpARMRSBshiftLL, x.Type) + v1.AuxInt = 3 + v1.AddArg(x) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS x (MOVWconst [c]) a) + // cond: c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) + // result: (RSB (SLLconst [log2(c/9)] (ADDshiftLL x x [3])) 
a) + for { + _ = v.Args[2] + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMOVWconst { + break + } + c := v_1.AuxInt + a := v.Args[2] + if !(c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c / 9) + v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type) + v1.AuxInt = 3 + v1.AddArg(x) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v.AddArg(a) + return true + } + return false +} +func rewriteValueARM_OpARMMULS_10(v *Value) bool { + b := v.Block + _ = b + // match: (MULS (MOVWconst [c]) x a) + // cond: int32(c) == -1 + // result: (ADD a x) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + c := v_0.AuxInt + x := v.Args[1] + a := v.Args[2] + if !(int32(c) == -1) { + break + } + v.reset(OpARMADD) + v.AddArg(a) + v.AddArg(x) + return true + } + // match: (MULS (MOVWconst [0]) _ a) + // cond: + // result: a + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + if v_0.AuxInt != 0 { + break + } + a := v.Args[2] + v.reset(OpCopy) + v.Type = a.Type + v.AddArg(a) + return true + } + // match: (MULS (MOVWconst [1]) x a) + // cond: + // result: (RSB x a) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + if v_0.AuxInt != 1 { + break + } + x := v.Args[1] + a := v.Args[2] + v.reset(OpARMRSB) + v.AddArg(x) + v.AddArg(a) + return true + } + // match: (MULS (MOVWconst [c]) x a) + // cond: isPowerOfTwo(c) + // result: (RSB (SLLconst [log2(c)] x) a) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + c := v_0.AuxInt + x := v.Args[1] + a := v.Args[2] + if !(isPowerOfTwo(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS (MOVWconst [c]) x a) + // cond: isPowerOfTwo(c-1) && int32(c) >= 3 + // result: (RSB 
(ADDshiftLL x x [log2(c-1)]) a) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + c := v_0.AuxInt + x := v.Args[1] + a := v.Args[2] + if !(isPowerOfTwo(c-1) && int32(c) >= 3) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type) + v0.AuxInt = log2(c - 1) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS (MOVWconst [c]) x a) + // cond: isPowerOfTwo(c+1) && int32(c) >= 7 + // result: (RSB (RSBshiftLL x x [log2(c+1)]) a) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + c := v_0.AuxInt + x := v.Args[1] + a := v.Args[2] + if !(isPowerOfTwo(c+1) && int32(c) >= 7) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMRSBshiftLL, x.Type) + v0.AuxInt = log2(c + 1) + v0.AddArg(x) + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS (MOVWconst [c]) x a) + // cond: c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) + // result: (RSB (SLLconst [log2(c/3)] (ADDshiftLL x x [1])) a) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + c := v_0.AuxInt + x := v.Args[1] + a := v.Args[2] + if !(c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c / 3) + v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type) + v1.AuxInt = 1 + v1.AddArg(x) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS (MOVWconst [c]) x a) + // cond: c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) + // result: (RSB (SLLconst [log2(c/5)] (ADDshiftLL x x [2])) a) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + c := v_0.AuxInt + x := v.Args[1] + a := v.Args[2] + if !(c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c / 5) + v1 := 
b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type) + v1.AuxInt = 2 + v1.AddArg(x) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS (MOVWconst [c]) x a) + // cond: c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) + // result: (RSB (SLLconst [log2(c/7)] (RSBshiftLL x x [3])) a) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + c := v_0.AuxInt + x := v.Args[1] + a := v.Args[2] + if !(c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c / 7) + v1 := b.NewValue0(v.Pos, OpARMRSBshiftLL, x.Type) + v1.AuxInt = 3 + v1.AddArg(x) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v.AddArg(a) + return true + } + // match: (MULS (MOVWconst [c]) x a) + // cond: c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) + // result: (RSB (SLLconst [log2(c/9)] (ADDshiftLL x x [3])) a) + for { + _ = v.Args[2] + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + c := v_0.AuxInt + x := v.Args[1] + a := v.Args[2] + if !(c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)) { + break + } + v.reset(OpARMRSB) + v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type) + v0.AuxInt = log2(c / 9) + v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type) + v1.AuxInt = 3 + v1.AddArg(x) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v.AddArg(a) + return true + } + return false +} func rewriteValueARM_OpARMMVN_0(v *Value) bool { // match: (MVN (MOVWconst [c])) // cond: @@ -10835,6 +11319,28 @@ func rewriteValueARM_OpARMRSB_10(v *Value) bool { v.AuxInt = 0 return true } + // match: (RSB (MUL x y) a) + // cond: objabi.GOARM == 7 + // result: (MULS x y a) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARMMUL { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + if !(objabi.GOARM == 7) { + break + } + v.reset(OpARMMULS) + v.AddArg(x) + v.AddArg(y) + v.AddArg(a) + return true + } return false } func 
rewriteValueARM_OpARMRSBSshiftLL_0(v *Value) bool { @@ -12865,6 +13371,28 @@ func rewriteValueARM_OpARMSUB_10(v *Value) bool { v.AuxInt = 0 return true } + // match: (SUB a (MUL x y)) + // cond: objabi.GOARM == 7 + // result: (MULS x y a) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMMUL { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + if !(objabi.GOARM == 7) { + break + } + v.reset(OpARMMULS) + v.AddArg(x) + v.AddArg(y) + v.AddArg(a) + return true + } return false } func rewriteValueARM_OpARMSUBS_0(v *Value) bool { -- 2.50.0