Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: optimize ARM with MULS
author Ben Shi <powerman1st@163.com>
Fri, 25 Aug 2017 12:07:01 +0000 (12:07 +0000)
committer Cherry Zhang <cherryyz@google.com>
Wed, 30 Aug 2017 13:45:08 +0000 (13:45 +0000)
MULS was introduced in ARMv7 and is the multiply-subtract counterpart of
MULA. This patch duplicates all MULA-related SSA rules for MULS.

Below are the benchmark results compared against the original Go compiler.
There is no overall improvement, but there is a big improvement in special cases.

1. A specific test case ran 18.62% faster.
(https://github.com/benshi001/ugo1/blob/master/mulsub_test.go)
name                     old time/op    new time/op    delta
MulSub-4                    270µs ± 0%     219µs ± 0%  -18.62%  (p=0.000 n=35+40)

2. Total size of all .a files in pkg/ shrank by 0.002%.

3. The compilecmp benchmark showed no regression.
name        old time/op       new time/op       delta
Template          2.37s ± 3%        2.36s ± 1%    ~     (p=0.233 n=19+18)
Unicode           1.32s ± 2%        1.34s ± 5%  +1.32%  (p=0.011 n=20+18)
GoTypes           7.88s ± 1%        7.87s ± 1%    ~     (p=0.758 n=20+20)
Compiler          37.5s ± 1%        37.6s ± 1%    ~     (p=0.194 n=20+19)
SSA               83.7s ± 2%        83.5s ± 2%    ~     (p=0.569 n=20+19)
Flate             1.46s ± 3%        1.45s ± 1%    ~     (p=0.619 n=20+17)
GoParser          1.87s ± 2%        1.85s ± 1%  -0.58%  (p=0.048 n=20+18)
Reflect           5.10s ± 2%        5.11s ± 2%    ~     (p=0.365 n=19+20)
Tar               1.78s ± 2%        1.78s ± 2%    ~     (p=0.531 n=19+20)
XML               2.62s ± 1%        2.61s ± 2%    ~     (p=0.057 n=17+19)
[Geo mean]        4.68s             4.67s       -0.07%

name        old user-time/op  new user-time/op  delta
Template          2.80s ± 1%        2.79s ± 2%    ~     (p=0.686 n=17+20)
Unicode           1.61s ± 4%        1.63s ± 6%    ~     (p=0.222 n=20+20)
GoTypes           9.59s ± 1%        9.60s ± 1%    ~     (p=0.482 n=17+20)
Compiler          46.1s ± 1%        46.2s ± 1%    ~     (p=0.373 n=20+18)
SSA                108s ± 1%         108s ± 2%    ~     (p=0.784 n=20+20)
Flate             1.68s ± 3%        1.69s ± 3%    ~     (p=0.335 n=20+19)
GoParser          2.20s ± 4%        2.19s ± 2%    ~     (p=0.844 n=20+18)
Reflect           5.97s ± 3%        6.01s ± 2%    ~     (p=0.184 n=20+20)
Tar               2.11s ± 2%        2.11s ± 4%    ~     (p=0.961 n=19+20)
XML               3.07s ± 1%        3.07s ± 3%    ~     (p=0.786 n=16+19)
[Geo mean]        5.61s             5.62s       +0.19%

name        old text-bytes    new text-bytes    delta
HelloSize         586kB ± 0%        586kB ± 0%    ~     (all equal)

name        old data-bytes    new data-bytes    delta
HelloSize        5.46kB ± 0%       5.46kB ± 0%    ~     (all equal)

name        old bss-bytes     new bss-bytes     delta
HelloSize        72.9kB ± 0%       72.9kB ± 0%    ~     (all equal)

name        old exe-bytes     new exe-bytes     delta
HelloSize        1.03MB ± 0%       1.03MB ± 0%    ~     (all equal)

4. The go1 benchmark showed no overall regression.
name                     old time/op    new time/op    delta
BinaryTree17-4              41.7s ± 1%     41.7s ± 1%    ~     (p=0.966 n=40+40)
Fannkuch11-4                23.6s ± 0%     23.6s ± 1%  -0.23%  (p=0.000 n=40+40)
FmtFprintfEmpty-4           844ns ± 1%     834ns ± 1%  -1.23%  (p=0.000 n=40+40)
FmtFprintfString-4         1.39µs ± 1%    1.40µs ± 1%  +0.71%  (p=0.000 n=40+40)
FmtFprintfInt-4            1.44µs ± 1%    1.45µs ± 1%  +0.70%  (p=0.000 n=40+40)
FmtFprintfIntInt-4         2.10µs ± 1%    2.10µs ± 1%  +0.30%  (p=0.000 n=40+40)
FmtFprintfPrefixedInt-4    2.49µs ± 0%    2.50µs ± 1%  +0.66%  (p=0.000 n=32+40)
FmtFprintfFloat-4          4.42µs ± 1%    4.46µs ± 2%  +0.94%  (p=0.000 n=40+40)
FmtManyArgs-4              8.31µs ± 1%    8.22µs ± 1%  -1.09%  (p=0.000 n=40+40)
GobDecode-4                 105ms ± 1%     102ms ± 1%  -2.30%  (p=0.000 n=39+39)
GobEncode-4                90.2ms ± 1%    88.7ms ± 1%  -1.66%  (p=0.000 n=40+39)
Gzip-4                      4.17s ± 1%     4.16s ± 1%    ~     (p=0.785 n=40+40)
Gunzip-4                    608ms ± 1%     608ms ± 1%    ~     (p=0.481 n=40+40)
HTTPClientServer-4          697µs ± 2%     684µs ± 3%  -1.89%  (p=0.000 n=37+40)
JSONEncode-4                255ms ± 1%     256ms ± 1%  +0.35%  (p=0.000 n=40+40)
JSONDecode-4                920ms ± 1%     926ms ± 1%  +0.64%  (p=0.000 n=40+39)
Mandelbrot200-4            49.3ms ± 1%    49.3ms ± 0%  +0.07%  (p=0.005 n=40+40)
GoParse-4                  46.8ms ± 2%    46.7ms ± 1%    ~     (p=1.000 n=40+40)
RegexpMatchEasy0_32-4      1.27µs ± 0%    1.27µs ± 1%    ~     (p=0.057 n=40+40)
RegexpMatchEasy0_1K-4      7.97µs ± 7%    7.92µs ± 5%    ~     (p=0.094 n=40+40)
RegexpMatchEasy1_32-4      1.28µs ± 1%    1.28µs ± 1%    ~     (p=0.406 n=40+40)
RegexpMatchEasy1_1K-4      10.5µs ± 4%    10.5µs ± 3%    ~     (p=0.855 n=40+40)
RegexpMatchMedium_32-4     2.04µs ± 0%    2.04µs ± 1%  -0.22%  (p=0.000 n=39+40)
RegexpMatchMedium_1K-4      541µs ± 0%     540µs ± 1%  -0.25%  (p=0.000 n=40+38)
RegexpMatchHard_32-4       29.3µs ± 1%    29.3µs ± 0%    ~     (p=0.149 n=40+40)
RegexpMatchHard_1K-4        878µs ± 1%     880µs ± 0%  +0.14%  (p=0.005 n=36+35)
Revcomp-4                  81.8ms ± 2%    81.4ms ± 2%  -0.43%  (p=0.015 n=38+39)
Template-4                  1.05s ± 1%     1.05s ± 1%    ~     (p=0.302 n=40+35)
TimeParse-4                7.18µs ± 1%    7.26µs ± 1%  +1.05%  (p=0.000 n=40+36)
TimeFormat-4               13.1µs ± 1%    13.1µs ± 1%    ~     (p=0.698 n=37+40)
[Geo mean]                  733µs          732µs       -0.16%

name                     old speed      new speed      delta
GobDecode-4              7.34MB/s ± 1%  7.51MB/s ± 1%  +2.36%  (p=0.000 n=39+39)
GobEncode-4              8.51MB/s ± 1%  8.65MB/s ± 1%  +1.69%  (p=0.000 n=40+39)
Gzip-4                   4.66MB/s ± 1%  4.66MB/s ± 1%    ~     (p=0.783 n=40+40)
Gunzip-4                 31.9MB/s ± 1%  31.9MB/s ± 1%    ~     (p=0.466 n=40+40)
JSONEncode-4             7.61MB/s ± 1%  7.58MB/s ± 1%  -0.35%  (p=0.001 n=40+40)
JSONDecode-4             2.11MB/s ± 1%  2.10MB/s ± 1%  -0.52%  (p=0.000 n=38+39)
GoParse-4                1.24MB/s ± 2%  1.24MB/s ± 1%    ~     (p=0.556 n=40+39)
RegexpMatchEasy0_32-4    25.1MB/s ± 0%  25.1MB/s ± 1%    ~     (p=0.064 n=40+40)
RegexpMatchEasy0_1K-4     129MB/s ± 8%   129MB/s ± 5%    ~     (p=0.094 n=40+40)
RegexpMatchEasy1_32-4    25.0MB/s ± 1%  25.1MB/s ± 1%    ~     (p=0.331 n=40+40)
RegexpMatchEasy1_1K-4    97.7MB/s ± 4%  97.8MB/s ± 3%    ~     (p=0.851 n=40+40)
RegexpMatchMedium_32-4    490kB/s ± 0%   490kB/s ± 0%    ~     (all equal)
RegexpMatchMedium_1K-4   1.89MB/s ± 0%  1.90MB/s ± 1%  +0.12%  (p=0.031 n=40+40)
RegexpMatchHard_32-4     1.09MB/s ± 1%  1.09MB/s ± 1%    ~     (p=0.597 n=40+40)
RegexpMatchHard_1K-4     1.16MB/s ± 1%  1.16MB/s ± 1%    ~     (p=0.565 n=40+35)
Revcomp-4                31.1MB/s ± 2%  31.2MB/s ± 2%  +0.44%  (p=0.018 n=38+39)
Template-4               1.85MB/s ± 1%  1.85MB/s ± 1%    ~     (p=0.873 n=40+40)
[Geo mean]               6.66MB/s       6.67MB/s       +0.26%

Change-Id: Icc972d8a78ea06c32c3aa15733ff0537c82c2dc7
Reviewed-on: https://go-review.googlesource.com/58950
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>

src/cmd/compile/internal/arm/ssa.go
src/cmd/compile/internal/ssa/gen/ARM.rules
src/cmd/compile/internal/ssa/gen/ARMOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteARM.go

index d0d864d25ddd2d6e29a6e8db5cb8743fac02db5a..79511191279b7ccf53ceb1492b82a8b06a43d528 100644 (file)
@@ -402,7 +402,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.To.Type = obj.TYPE_REGREG
                p.To.Reg = v.Reg0()           // high 32-bit
                p.To.Offset = int64(v.Reg1()) // low 32-bit
-       case ssa.OpARMMULA:
+       case ssa.OpARMMULA, ssa.OpARMMULS:
                p := s.Prog(v.Op.Asm())
                p.From.Type = obj.TYPE_REG
                p.From.Reg = v.Args[0].Reg()
index b80747ad31600993ef9b21491815cdcf933eb30a..9062453e4790dce3ddcc58b34d7ff7f5b46be332 100644 (file)
 (MULA (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) -> (ADD (SLLconst <x.Type> [log2(c/7)] (RSBshiftLL <x.Type> x x [3])) a)
 (MULA (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) -> (ADD (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])) a)
 
+(MULS x (MOVWconst [c]) a) && int32(c) == -1 -> (ADD a x)
+(MULS _ (MOVWconst [0]) a) -> a
+(MULS x (MOVWconst [1]) a) -> (RSB x a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo(c) -> (RSB (SLLconst <x.Type> [log2(c)] x) a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo(c-1) && int32(c) >= 3 -> (RSB (ADDshiftLL <x.Type> x x [log2(c-1)]) a)
+(MULS x (MOVWconst [c]) a) && isPowerOfTwo(c+1) && int32(c) >= 7 -> (RSB (RSBshiftLL <x.Type> x x [log2(c+1)]) a)
+(MULS x (MOVWconst [c]) a) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) -> (RSB (SLLconst <x.Type> [log2(c/3)] (ADDshiftLL <x.Type> x x [1])) a)
+(MULS x (MOVWconst [c]) a) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) -> (RSB (SLLconst <x.Type> [log2(c/5)] (ADDshiftLL <x.Type> x x [2])) a)
+(MULS x (MOVWconst [c]) a) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) -> (RSB (SLLconst <x.Type> [log2(c/7)] (RSBshiftLL <x.Type> x x [3])) a)
+(MULS x (MOVWconst [c]) a) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) -> (RSB (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])) a)
+
+(MULS (MOVWconst [c]) x a) && int32(c) == -1 -> (ADD a x)
+(MULS (MOVWconst [0]) _ a) -> a
+(MULS (MOVWconst [1]) x a) -> (RSB x a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo(c) -> (RSB (SLLconst <x.Type> [log2(c)] x) a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo(c-1) && int32(c) >= 3 -> (RSB (ADDshiftLL <x.Type> x x [log2(c-1)]) a)
+(MULS (MOVWconst [c]) x a) && isPowerOfTwo(c+1) && int32(c) >= 7 -> (RSB (RSBshiftLL <x.Type> x x [log2(c+1)]) a)
+(MULS (MOVWconst [c]) x a) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) -> (RSB (SLLconst <x.Type> [log2(c/3)] (ADDshiftLL <x.Type> x x [1])) a)
+(MULS (MOVWconst [c]) x a) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) -> (RSB (SLLconst <x.Type> [log2(c/5)] (ADDshiftLL <x.Type> x x [2])) a)
+(MULS (MOVWconst [c]) x a) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) -> (RSB (SLLconst <x.Type> [log2(c/7)] (RSBshiftLL <x.Type> x x [3])) a)
+(MULS (MOVWconst [c]) x a) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) -> (RSB (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])) a)
+
 // div by constant
 (Select0 (CALLudiv x (MOVWconst [1]))) -> x
 (Select1 (CALLudiv _ (MOVWconst [1]))) -> (MOVWconst [0])
 (BIC x x) -> (MOVWconst [0])
 
 (ADD (MUL x y) a) -> (MULA x y a)
+(SUB a (MUL x y)) && objabi.GOARM == 7 -> (MULS x y a)
+(RSB (MUL x y) a) && objabi.GOARM == 7 -> (MULS x y a)
 
 (AND x (MVN y)) -> (BIC x y)
 
index 8cbb73f291224d70bfff75a5da7bdb24165c6d6a..2a041d134074d17e096e7f1fd3037f54f80b0211 100644 (file)
@@ -168,6 +168,7 @@ func init() {
 
                {name: "MULLU", argLength: 2, reg: gp22, asm: "MULLU", commutative: true}, // arg0 * arg1, high 32 bits in out0, low 32 bits in out1
                {name: "MULA", argLength: 3, reg: gp31, asm: "MULA"},                      // arg0 * arg1 + arg2
+               {name: "MULS", argLength: 3, reg: gp31, asm: "MULS"},                      // arg2 - arg0 * arg1
 
                {name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
                {name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
index 8d249dd19c972a55543a181b851ba51009ec1aa3..eec57b8b038756cfcc0aab4c5c78e24c0eebdc14 100644 (file)
@@ -694,6 +694,7 @@ const (
        OpARMRSCconst
        OpARMMULLU
        OpARMMULA
+       OpARMMULS
        OpARMADDF
        OpARMADDD
        OpARMSUBF
@@ -8456,6 +8457,21 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "MULS",
+               argLen: 3,
+               asm:    arm.AMULS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14
+                               {1, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14
+                               {2, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14
+                       },
+                       outputs: []outputInfo{
+                               {0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14
+                       },
+               },
+       },
        {
                name:        "ADDF",
                argLen:      2,
index 095f8c135667d2706b5f2457fd0cc0edd6253ad7..64aec25bda804761ddad046e2a47d4221adbea9e 100644 (file)
@@ -201,6 +201,8 @@ func rewriteValueARM(v *Value) bool {
                return rewriteValueARM_OpARMMUL_0(v) || rewriteValueARM_OpARMMUL_10(v) || rewriteValueARM_OpARMMUL_20(v)
        case OpARMMULA:
                return rewriteValueARM_OpARMMULA_0(v) || rewriteValueARM_OpARMMULA_10(v) || rewriteValueARM_OpARMMULA_20(v)
+       case OpARMMULS:
+               return rewriteValueARM_OpARMMULS_0(v) || rewriteValueARM_OpARMMULS_10(v)
        case OpARMMVN:
                return rewriteValueARM_OpARMMVN_0(v)
        case OpARMMVNshiftLL:
@@ -9571,6 +9573,488 @@ func rewriteValueARM_OpARMMULA_20(v *Value) bool {
        }
        return false
 }
+func rewriteValueARM_OpARMMULS_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       // match: (MULS x (MOVWconst [c]) a)
+       // cond: int32(c) == -1
+       // result: (ADD a x)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_1.AuxInt
+               a := v.Args[2]
+               if !(int32(c) == -1) {
+                       break
+               }
+               v.reset(OpARMADD)
+               v.AddArg(a)
+               v.AddArg(x)
+               return true
+       }
+       // match: (MULS _ (MOVWconst [0]) a)
+       // cond:
+       // result: a
+       for {
+               _ = v.Args[2]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               if v_1.AuxInt != 0 {
+                       break
+               }
+               a := v.Args[2]
+               v.reset(OpCopy)
+               v.Type = a.Type
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS x (MOVWconst [1]) a)
+       // cond:
+       // result: (RSB x a)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               if v_1.AuxInt != 1 {
+                       break
+               }
+               a := v.Args[2]
+               v.reset(OpARMRSB)
+               v.AddArg(x)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS x (MOVWconst [c]) a)
+       // cond: isPowerOfTwo(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c)] x) a)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_1.AuxInt
+               a := v.Args[2]
+               if !(isPowerOfTwo(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS x (MOVWconst [c]) a)
+       // cond: isPowerOfTwo(c-1) && int32(c) >= 3
+       // result: (RSB (ADDshiftLL <x.Type> x x [log2(c-1)]) a)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_1.AuxInt
+               a := v.Args[2]
+               if !(isPowerOfTwo(c-1) && int32(c) >= 3) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type)
+               v0.AuxInt = log2(c - 1)
+               v0.AddArg(x)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS x (MOVWconst [c]) a)
+       // cond: isPowerOfTwo(c+1) && int32(c) >= 7
+       // result: (RSB (RSBshiftLL <x.Type> x x [log2(c+1)]) a)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_1.AuxInt
+               a := v.Args[2]
+               if !(isPowerOfTwo(c+1) && int32(c) >= 7) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMRSBshiftLL, x.Type)
+               v0.AuxInt = log2(c + 1)
+               v0.AddArg(x)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS x (MOVWconst [c]) a)
+       // cond: c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c/3)] (ADDshiftLL <x.Type> x x [1])) a)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_1.AuxInt
+               a := v.Args[2]
+               if !(c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c / 3)
+               v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type)
+               v1.AuxInt = 1
+               v1.AddArg(x)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS x (MOVWconst [c]) a)
+       // cond: c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c/5)] (ADDshiftLL <x.Type> x x [2])) a)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_1.AuxInt
+               a := v.Args[2]
+               if !(c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c / 5)
+               v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type)
+               v1.AuxInt = 2
+               v1.AddArg(x)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS x (MOVWconst [c]) a)
+       // cond: c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c/7)] (RSBshiftLL <x.Type> x x [3])) a)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_1.AuxInt
+               a := v.Args[2]
+               if !(c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c / 7)
+               v1 := b.NewValue0(v.Pos, OpARMRSBshiftLL, x.Type)
+               v1.AuxInt = 3
+               v1.AddArg(x)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS x (MOVWconst [c]) a)
+       // cond: c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])) a)
+       for {
+               _ = v.Args[2]
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_1.AuxInt
+               a := v.Args[2]
+               if !(c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c / 9)
+               v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type)
+               v1.AuxInt = 3
+               v1.AddArg(x)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       return false
+}
+func rewriteValueARM_OpARMMULS_10(v *Value) bool {
+       b := v.Block
+       _ = b
+       // match: (MULS (MOVWconst [c]) x a)
+       // cond: int32(c) == -1
+       // result: (ADD a x)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v.Args[1]
+               a := v.Args[2]
+               if !(int32(c) == -1) {
+                       break
+               }
+               v.reset(OpARMADD)
+               v.AddArg(a)
+               v.AddArg(x)
+               return true
+       }
+       // match: (MULS (MOVWconst [0]) _ a)
+       // cond:
+       // result: a
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               if v_0.AuxInt != 0 {
+                       break
+               }
+               a := v.Args[2]
+               v.reset(OpCopy)
+               v.Type = a.Type
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS (MOVWconst [1]) x a)
+       // cond:
+       // result: (RSB x a)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               if v_0.AuxInt != 1 {
+                       break
+               }
+               x := v.Args[1]
+               a := v.Args[2]
+               v.reset(OpARMRSB)
+               v.AddArg(x)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS (MOVWconst [c]) x a)
+       // cond: isPowerOfTwo(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c)] x) a)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v.Args[1]
+               a := v.Args[2]
+               if !(isPowerOfTwo(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS (MOVWconst [c]) x a)
+       // cond: isPowerOfTwo(c-1) && int32(c) >= 3
+       // result: (RSB (ADDshiftLL <x.Type> x x [log2(c-1)]) a)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v.Args[1]
+               a := v.Args[2]
+               if !(isPowerOfTwo(c-1) && int32(c) >= 3) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type)
+               v0.AuxInt = log2(c - 1)
+               v0.AddArg(x)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS (MOVWconst [c]) x a)
+       // cond: isPowerOfTwo(c+1) && int32(c) >= 7
+       // result: (RSB (RSBshiftLL <x.Type> x x [log2(c+1)]) a)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v.Args[1]
+               a := v.Args[2]
+               if !(isPowerOfTwo(c+1) && int32(c) >= 7) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMRSBshiftLL, x.Type)
+               v0.AuxInt = log2(c + 1)
+               v0.AddArg(x)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS (MOVWconst [c]) x a)
+       // cond: c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c/3)] (ADDshiftLL <x.Type> x x [1])) a)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v.Args[1]
+               a := v.Args[2]
+               if !(c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c / 3)
+               v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type)
+               v1.AuxInt = 1
+               v1.AddArg(x)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS (MOVWconst [c]) x a)
+       // cond: c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c/5)] (ADDshiftLL <x.Type> x x [2])) a)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v.Args[1]
+               a := v.Args[2]
+               if !(c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c / 5)
+               v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type)
+               v1.AuxInt = 2
+               v1.AddArg(x)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS (MOVWconst [c]) x a)
+       // cond: c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c/7)] (RSBshiftLL <x.Type> x x [3])) a)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v.Args[1]
+               a := v.Args[2]
+               if !(c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c / 7)
+               v1 := b.NewValue0(v.Pos, OpARMRSBshiftLL, x.Type)
+               v1.AuxInt = 3
+               v1.AddArg(x)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       // match: (MULS (MOVWconst [c]) x a)
+       // cond: c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)
+       // result: (RSB (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])) a)
+       for {
+               _ = v.Args[2]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v.Args[1]
+               a := v.Args[2]
+               if !(c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)) {
+                       break
+               }
+               v.reset(OpARMRSB)
+               v0 := b.NewValue0(v.Pos, OpARMSLLconst, x.Type)
+               v0.AuxInt = log2(c / 9)
+               v1 := b.NewValue0(v.Pos, OpARMADDshiftLL, x.Type)
+               v1.AuxInt = 3
+               v1.AddArg(x)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               v.AddArg(a)
+               return true
+       }
+       return false
+}
 func rewriteValueARM_OpARMMVN_0(v *Value) bool {
        // match: (MVN (MOVWconst [c]))
        // cond:
@@ -10835,6 +11319,28 @@ func rewriteValueARM_OpARMRSB_10(v *Value) bool {
                v.AuxInt = 0
                return true
        }
+       // match: (RSB (MUL x y) a)
+       // cond: objabi.GOARM == 7
+       // result: (MULS x y a)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMUL {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               if !(objabi.GOARM == 7) {
+                       break
+               }
+               v.reset(OpARMMULS)
+               v.AddArg(x)
+               v.AddArg(y)
+               v.AddArg(a)
+               return true
+       }
        return false
 }
 func rewriteValueARM_OpARMRSBSshiftLL_0(v *Value) bool {
@@ -12865,6 +13371,28 @@ func rewriteValueARM_OpARMSUB_10(v *Value) bool {
                v.AuxInt = 0
                return true
        }
+       // match: (SUB a (MUL x y))
+       // cond: objabi.GOARM == 7
+       // result: (MULS x y a)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARMMUL {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               if !(objabi.GOARM == 7) {
+                       break
+               }
+               v.reset(OpARMMULS)
+               v.AddArg(x)
+               v.AddArg(y)
+               v.AddArg(a)
+               return true
+       }
        return false
 }
 func rewriteValueARM_OpARMSUBS_0(v *Value) bool {