From: Ben Shi
Date: Mon, 19 Feb 2018 13:13:13 +0000 (+0000)
Subject: cmd/compile: optimize ARM64 code with MNEG
X-Git-Tag: go1.11beta1~1544
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=3c8b824453b4119829d1a68eec02a0143611913d;p=gostls13.git

cmd/compile: optimize ARM64 code with MNEG

A pair of MUL/NEG instructions can be combined into a single MNEG on
ARM64. This CL implements that optimization.
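As an illustration (a sketch added for this write-up, not code from the
CL), these are the kinds of functions whose multiply-negate patterns the
new rules collapse into a single MNEG/MNEGW:

	// Before this CL: a MUL followed by a NEG. After: one MNEG.
	func mulNeg(a, b int64) int64 {
		return -(a * b) // (NEG (MUL a b)) -> (MNEG a b)
	}

	// The 32-bit variant lowers to MNEGW instead.
	func mulNeg32(a, b int32) int32 {
		return (-a) * b // (MULW (NEG a) b) -> (MNEGW a b)
	}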
1. A special test case gets a big improvement.
(https://github.com/benshi001/ugo1/blob/master/mneg_test.go)

name      old time/op    new time/op    delta
MNEG-4     315µs ± 0%     260µs ± 0%  -17.39%  (p=0.000 n=24+25)

2. There is little change in the go1 benchmark, excluding noise.

name                     old time/op    new time/op    delta
BinaryTree17-4              42.2s ± 2%     41.9s ± 2%  -0.82%  (p=0.001 n=30+26)
Fannkuch11-4                32.9s ± 0%     32.9s ± 0%  -0.01%  (p=0.006 n=20+26)
FmtFprintfEmpty-4           541ns ± 3%     534ns ± 0%  -1.24%  (p=0.003 n=30+26)
FmtFprintfString-4         1.09µs ± 0%    1.10µs ± 3%    ~     (p=0.142 n=23+30)
FmtFprintfInt-4            1.14µs ± 0%    1.14µs ± 0%    ~     (p=0.435 n=24+24)
FmtFprintfIntInt-4         1.76µs ± 0%    1.76µs ± 0%    ~     (p=0.508 n=24+26)
FmtFprintfPrefixedInt-4    2.20µs ± 3%    2.17µs ± 0%  -1.10%  (p=0.017 n=30+24)
FmtFprintfFloat-4          3.28µs ± 0%    3.28µs ± 0%    ~     (p=0.579 n=24+24)
FmtManyArgs-4              7.30µs ± 0%    7.30µs ± 0%    ~     (p=0.662 n=26+27)
GobDecode-4                94.8ms ± 0%    94.8ms ± 0%  +0.07%  (p=0.010 n=25+23)
GobEncode-4                80.9ms ± 4%    80.6ms ± 4%    ~     (p=0.901 n=30+30)
Gzip-4                      4.45s ± 0%     4.49s ± 0%  +0.98%  (p=0.000 n=25+24)
Gunzip-4                    450ms ± 3%     443ms ± 0%    ~     (p=0.942 n=30+26)
HTTPClientServer-4          548µs ± 1%     551µs ± 1%  +0.60%  (p=0.000 n=29+30)
JSONEncode-4                210ms ± 0%     211ms ± 0%  +0.03%  (p=0.000 n=23+25)
JSONDecode-4                866ms ± 5%     877ms ± 5%    ~     (p=0.187 n=30+30)
Mandelbrot200-4            51.4ms ± 0%    52.0ms ± 3%  +1.15%  (p=0.001 n=24+30)
GoParse-4                  42.9ms ± 5%    41.9ms ± 0%  -2.24%  (p=0.000 n=30+26)
RegexpMatchEasy0_32-4      1.02µs ± 3%    1.01µs ± 0%    ~     (p=0.247 n=30+26)
RegexpMatchEasy0_1K-4      3.90µs ± 0%    3.90µs ± 0%    ~     (p=0.062 n=24+24)
RegexpMatchEasy1_32-4       955ns ± 0%     956ns ± 0%  +0.16%  (p=0.000 n=25+23)
RegexpMatchEasy1_1K-4      6.42µs ± 3%    6.37µs ± 0%  -0.81%  (p=0.012 n=30+24)
RegexpMatchMedium_32-4     1.77µs ± 3%    1.79µs ± 0%  +1.28%  (p=0.003 n=30+24)
RegexpMatchMedium_1K-4      561µs ± 0%     569µs ± 3%  +1.50%  (p=0.000 n=25+30)
RegexpMatchHard_32-4       31.0µs ± 4%    30.8µs ± 0%    ~     (p=1.000 n=26+26)
RegexpMatchHard_1K-4        945µs ± 3%     945µs ± 3%    ~     (p=0.513 n=30+30)
Revcomp-4                   7.76s ± 4%     7.68s ± 0%    ~     (p=0.464 n=29+23)
Template-4                  903ms ± 5%     904ms ± 5%    ~     (p=0.248 n=30+30)
TimeParse-4                4.80µs ± 0%    4.80µs ± 0%    ~     (p=0.081 n=25+26)
TimeFormat-4               4.70µs ± 1%    4.70µs ± 1%    ~     (p=0.763 n=24+26)
[Geo mean]                  709µs          708µs        -0.09%

name                     old speed      new speed      delta
GobDecode-4              8.10MB/s ± 0%  8.09MB/s ± 0%    ~     (p=0.160 n=25+23)
GobEncode-4              9.49MB/s ± 4%  9.53MB/s ± 4%    ~     (p=0.360 n=30+30)
Gzip-4                   4.36MB/s ± 0%  4.32MB/s ± 0%  -0.92%  (p=0.000 n=25+24)
Gunzip-4                 43.2MB/s ± 3%  43.8MB/s ± 0%    ~     (p=0.980 n=30+26)
JSONEncode-4             9.22MB/s ± 0%  9.22MB/s ± 0%  -0.04%  (p=0.005 n=23+25)
JSONDecode-4             2.24MB/s ± 5%  2.21MB/s ± 4%    ~     (p=0.252 n=30+30)
GoParse-4                1.35MB/s ± 5%  1.38MB/s ± 0%  +2.00%  (p=0.003 n=30+26)
RegexpMatchEasy0_32-4    31.5MB/s ± 3%  31.8MB/s ± 0%    ~     (p=0.110 n=30+26)
RegexpMatchEasy0_1K-4     263MB/s ± 0%   263MB/s ± 0%    ~     (p=0.111 n=24+24)
RegexpMatchEasy1_32-4    33.5MB/s ± 0%  33.4MB/s ± 0%  -0.16%  (p=0.003 n=25+23)
RegexpMatchEasy1_1K-4     160MB/s ± 3%   161MB/s ± 0%  +0.78%  (p=0.012 n=30+24)
RegexpMatchMedium_32-4    565kB/s ± 3%   560kB/s ± 0%  -0.83%  (p=0.001 n=30+24)
RegexpMatchMedium_1K-4   1.83MB/s ± 0%  1.80MB/s ± 3%  -1.56%  (p=0.000 n=25+30)
RegexpMatchHard_32-4     1.03MB/s ± 3%  1.04MB/s ± 0%  +1.46%  (p=0.000 n=30+26)
RegexpMatchHard_1K-4     1.08MB/s ± 3%  1.09MB/s ± 3%    ~     (p=0.444 n=30+30)
Revcomp-4                32.8MB/s ± 4%  33.1MB/s ± 0%    ~     (p=0.858 n=29+23)
Template-4               2.15MB/s ± 5%  2.15MB/s ± 5%    ~     (p=0.646 n=30+30)
[Geo mean]               7.79MB/s       7.81MB/s       +0.21%

3. There is no regression in the compilecmp benchmark.

name        old time/op       new time/op       delta
Template          2.35s ± 4%        2.33s ± 3%    ~     (p=0.796 n=10+10)
Unicode           1.35s ± 6%        1.35s ± 5%    ~     (p=1.000 n=9+10)
GoTypes           8.10s ± 3%        8.14s ± 3%    ~     (p=0.604 n=9+10)
Compiler          40.5s ± 2%        40.2s ± 2%    ~     (p=0.065 n=10+9)
SSA                115s ± 2%         115s ± 2%    ~     (p=0.447 n=9+10)
Flate             1.45s ± 3%        1.45s ± 4%    ~     (p=0.739 n=10+10)
GoParser          1.85s ± 3%        1.86s ± 2%    ~     (p=0.853 n=10+10)
Reflect           5.11s ± 2%        5.10s ± 2%    ~     (p=0.971 n=10+10)
Tar               2.23s ± 5%        2.23s ± 3%    ~     (p=0.796 n=10+10)
XML               2.67s ± 2%        2.69s ± 2%    ~     (p=0.549 n=9+10)
[Geo mean]        5.00s             5.00s       +0.02%

name        old user-time/op  new user-time/op  delta
Template          2.88s ± 2%        2.86s ± 2%    ~     (p=0.529 n=10+10)
Unicode           1.70s ± 7%        1.69s ± 5%    ~     (p=0.853 n=10+10)
GoTypes           9.72s ± 1%        9.73s ± 1%    ~     (p=0.684 n=10+10)
Compiler          49.0s ± 1%        48.9s ± 1%    ~     (p=0.631 n=10+10)
SSA                144s ± 1%         144s ± 2%    ~     (p=0.684 n=10+10)
Flate             1.71s ± 4%        1.72s ± 4%    ~     (p=0.853 n=10+10)
GoParser          2.23s ± 2%        2.23s ± 2%    ~     (p=0.971 n=10+10)
Reflect           5.98s ± 2%        5.96s ± 2%    ~     (p=0.481 n=10+10)
Tar               2.68s ± 3%        2.67s ± 2%    ~     (p=0.393 n=10+10)
XML               3.21s ± 3%        3.22s ± 1%    ~     (p=0.604 n=10+9)
[Geo mean]        6.05s             6.05s       -0.04%

name        old text-bytes    new text-bytes    delta
HelloSize         641kB ± 0%        641kB ± 0%    ~     (all equal)

name        old data-bytes    new data-bytes    delta
HelloSize        9.46kB ± 0%       9.46kB ± 0%    ~     (all equal)

name        old bss-bytes     new bss-bytes     delta
HelloSize         125kB ± 0%        125kB ± 0%    ~     (all equal)

name        old exe-bytes     new exe-bytes     delta
HelloSize        1.24MB ± 0%       1.24MB ± 0%    ~     (all equal)

Change-Id: I9ed9128f0114e0f1ebb08ca2d042c90fcb2b1dcd
Reviewed-on: https://go-review.googlesource.com/95075
Reviewed-by: Cherry Zhang
Run-TryBot: Cherry Zhang
TryBot-Result: Gobot Gobot
---

diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index 018cdff03d..574fb9ff73 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -150,6 +150,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssa.OpARM64BIC,
 		ssa.OpARM64MUL,
 		ssa.OpARM64MULW,
+		ssa.OpARM64MNEG,
+		ssa.OpARM64MNEGW,
 		ssa.OpARM64MULH,
 		ssa.OpARM64UMULH,
 		ssa.OpARM64MULL,
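Adding the two opcodes to this case list is the whole code-generation
change: every op in the list is emitted through the shared two-source,
one-destination path. Roughly (a paraphrase of the surrounding case,
shown for context; not code added by this CL):

	// Emit "op Rm, Rn, Rd" for a generic two-operand SSA value.
	r := v.Reg()          // destination (Rd)
	r1 := v.Args[0].Reg() // first source (Rn)
	r2 := v.Args[1].Reg() // second source (Rm)
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = r2
	p.Reg = r1
	p.To.Type = obj.TYPE_REG
	p.To.Reg = r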
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules
index 0e269dc753..5132e42b6d 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -810,6 +810,12 @@
 (CMPW x (MOVDconst [c])) -> (CMPWconst [int64(int32(c))] x)
 (CMPW (MOVDconst [c]) x) -> (InvertFlags (CMPWconst [int64(int32(c))] x))
 
+// mul-neg -> mneg
+(NEG (MUL x y)) -> (MNEG x y)
+(NEG (MULW x y)) -> (MNEGW x y)
+(MUL (NEG x) y) -> (MNEG x y)
+(MULW (NEG x) y) -> (MNEGW x y)
+
 // mul by constant
 (MUL x (MOVDconst [-1])) -> (NEG x)
 (MUL _ (MOVDconst [0])) -> (MOVDconst [0])
@@ -833,6 +839,29 @@
 (MULW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) -> (SLLconst [log2(c/7)] (ADDshiftLL (NEG x) x [3]))
 (MULW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) -> (SLLconst [log2(c/9)] (ADDshiftLL x x [3]))
 
+// mneg by constant
+(MNEG x (MOVDconst [-1])) -> x
+(MNEG _ (MOVDconst [0])) -> (MOVDconst [0])
+(MNEG x (MOVDconst [1])) -> (NEG x)
+(MNEG x (MOVDconst [c])) && isPowerOfTwo(c) -> (NEG (SLLconst [log2(c)] x))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo(c-1) && c >= 3 -> (NEG (ADDshiftLL x x [log2(c-1)]))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo(c+1) && c >= 7 -> (NEG (ADDshiftLL (NEG x) x [log2(c+1)]))
+(MNEG x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) -> (NEG (SLLconst [log2(c/3)] (ADDshiftLL x x [1])))
+(MNEG x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) -> (NEG (SLLconst [log2(c/5)] (ADDshiftLL x x [2])))
+(MNEG x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) -> (NEG (SLLconst [log2(c/7)] (ADDshiftLL (NEG x) x [3])))
+(MNEG x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) -> (NEG (SLLconst [log2(c/9)] (ADDshiftLL x x [3])))
+
+(MNEGW x (MOVDconst [c])) && int32(c)==-1 -> x
+(MNEGW _ (MOVDconst [c])) && int32(c)==0 -> (MOVDconst [0])
+(MNEGW x (MOVDconst [c])) && int32(c)==1 -> (NEG x)
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo(c) -> (NEG (SLLconst [log2(c)] x))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo(c-1) && int32(c) >= 3 -> (NEG (ADDshiftLL x x [log2(c-1)]))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo(c+1) && int32(c) >= 7 -> (NEG (ADDshiftLL (NEG x) x [log2(c+1)]))
+(MNEGW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) -> (NEG (SLLconst [log2(c/3)] (ADDshiftLL x x [1])))
+(MNEGW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) -> (NEG (SLLconst [log2(c/5)] (ADDshiftLL x x [2])))
+(MNEGW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) -> (NEG (SLLconst [log2(c/7)] (ADDshiftLL (NEG x) x [3])))
+(MNEGW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) -> (NEG (SLLconst [log2(c/9)] (ADDshiftLL x x [3])))
+
 // div by constant
 (UDIV x (MOVDconst [1])) -> x
 (UDIV x (MOVDconst [c])) && isPowerOfTwo(c) -> (SRLconst [log2(c)] x)
@@ -880,6 +909,8 @@
 (SRAconst [c] (MOVDconst [d])) -> (MOVDconst [int64(d)>>uint64(c)])
 (MUL (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [c*d])
 (MULW (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [int64(int32(c)*int32(d))])
+(MNEG (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [-c*d])
+(MNEGW (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [-int64(int32(c)*int32(d))])
 (DIV (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [int64(c)/int64(d)])
 (UDIV (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [int64(uint64(c)/uint64(d))])
 (DIVW (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [int64(int32(c)/int32(d))])
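A worked instance of the mneg-by-constant rules (for illustration):
-(x*6) reaches the back end as (MNEG x (MOVDconst [6])); since 6%3 == 0
and 6/3 == 2 is a power of two, the c%3 rule fires:

	(MNEG x (MOVDconst [6]))
	-> (NEG (SLLconst [1] (ADDshiftLL x x [1])))
	// x + x<<1 = 3x; 3x<<1 = 6x; the outer NEG gives -6x

so a multiply becomes three single-cycle ALU ops, and the folding rules
in the last hunk handle the case where both operands are constants.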
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
index a5755659b8..6acc9c89f2 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
@@ -165,6 +165,8 @@ func init() {
 		{name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int64"},     // arg0 - auxInt
 		{name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true},     // arg0 * arg1
 		{name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true},   // arg0 * arg1, 32-bit
+		{name: "MNEG", argLength: 2, reg: gp21, asm: "MNEG", commutative: true},   // -arg0 * arg1
+		{name: "MNEGW", argLength: 2, reg: gp21, asm: "MNEGW", commutative: true}, // -arg0 * arg1, 32-bit
 		{name: "MULH", argLength: 2, reg: gp21, asm: "SMULH", commutative: true},  // (arg0 * arg1) >> 64, signed
 		{name: "UMULH", argLength: 2, reg: gp21, asm: "UMULH", commutative: true}, // (arg0 * arg1) >> 64, unsigned
 		{name: "MULL", argLength: 2, reg: gp21, asm: "SMULL", commutative: true},  // arg0 * arg1, signed, 32-bit mult results in 64-bit
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 2dff261ca4..95d50c0672 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -957,6 +957,8 @@ const (
 	OpARM64SUBconst
 	OpARM64MUL
 	OpARM64MULW
+	OpARM64MNEG
+	OpARM64MNEGW
 	OpARM64MULH
 	OpARM64UMULH
 	OpARM64MULL
@@ -12109,6 +12111,36 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:        "MNEG",
+		argLen:      2,
+		commutative: true,
+		asm:         arm64.AMNEG,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
+	{
+		name:        "MNEGW",
+		argLen:      2,
+		commutative: true,
+		asm:         arm64.AMNEGW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+				{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
+			},
+			outputs: []outputInfo{
+				{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+			},
+		},
+	},
 	{
 		name:        "MULH",
 		argLen:      2,
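The op comment above ("-arg0 * arg1, 32-bit") pins down MNEGW's
semantics: multiply the low 32 bits, negate, sign-extend. The
constant-folding rule mirrors that exactly; a standalone sketch of its
arithmetic (hypothetical helper, not from the CL):

	// mnegwFold mirrors (MNEGW (MOVDconst [c]) (MOVDconst [d]))
	// -> (MOVDconst [-int64(int32(c)*int32(d))]).
	func mnegwFold(c, d int64) int64 {
		return -int64(int32(c) * int32(d))
	}

	// mnegwFold(0x80000001, 3) == 2147483645: int32(0x80000001) is
	// -2147483647, and the 32-bit product wraps to -2147483645 before
	// the negation, just as the hardware instruction behaves.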
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 405a8970fc..1b595e5a50 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -107,6 +107,10 @@ func rewriteValueARM64(v *Value) bool {
 		return rewriteValueARM64_OpARM64LessThan_0(v)
 	case OpARM64LessThanU:
 		return rewriteValueARM64_OpARM64LessThanU_0(v)
+	case OpARM64MNEG:
+		return rewriteValueARM64_OpARM64MNEG_0(v) || rewriteValueARM64_OpARM64MNEG_10(v) || rewriteValueARM64_OpARM64MNEG_20(v)
+	case OpARM64MNEGW:
+		return rewriteValueARM64_OpARM64MNEGW_0(v) || rewriteValueARM64_OpARM64MNEGW_10(v) || rewriteValueARM64_OpARM64MNEGW_20(v)
 	case OpARM64MOD:
 		return rewriteValueARM64_OpARM64MOD_0(v)
 	case OpARM64MODW:
 		return rewriteValueARM64_OpARM64MODW_0(v)
@@ -4060,6 +4064,994 @@ func rewriteValueARM64_OpARM64LessThanU_0(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueARM64_OpARM64MNEG_0(v *Value) bool {
+	b := v.Block
+	_ = b
+	// match: (MNEG x (MOVDconst [-1]))
+	// cond:
+	// result: x
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1.AuxInt != -1 {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (MNEG (MOVDconst [-1]) x)
+	// cond:
+	// result: x
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0.AuxInt != -1 {
+			break
+		}
+		x := v.Args[1]
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (MNEG _ (MOVDconst [0]))
+	// cond:
+	// result: (MOVDconst [0])
+	for {
+		_ = v.Args[1]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1.AuxInt != 0 {
+			break
+		}
+		v.reset(OpARM64MOVDconst)
+		v.AuxInt = 0
+		return true
+	}
+	// match: (MNEG (MOVDconst [0]) _)
+	// cond:
+	// result: (MOVDconst [0])
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0.AuxInt != 0 {
+			break
+		}
+		v.reset(OpARM64MOVDconst)
+		v.AuxInt = 0
+		return true
+	}
+	// match: (MNEG x (MOVDconst [1]))
+	// cond:
+	// result: (NEG x)
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_1.AuxInt != 1 {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MNEG (MOVDconst [1]) x)
+	// cond:
+	// result: (NEG x)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		if v_0.AuxInt != 1 {
+			break
+		}
+		x := v.Args[1]
+		v.reset(OpARM64NEG)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MNEG x (MOVDconst [c]))
+	// cond: isPowerOfTwo(c)
+	// result: (NEG (SLLconst [log2(c)] x))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(isPowerOfTwo(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG (MOVDconst [c]) x)
+	// cond: isPowerOfTwo(c)
+	// result: (NEG (SLLconst [log2(c)] x))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(isPowerOfTwo(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG x (MOVDconst [c]))
+	// cond: isPowerOfTwo(c-1) && c >= 3
+	// result: (NEG (ADDshiftLL x x [log2(c-1)]))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(isPowerOfTwo(c-1) && c >= 3) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v0.AuxInt = log2(c - 1)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG (MOVDconst [c]) x)
+	// cond: isPowerOfTwo(c-1) && c >= 3
+	// result: (NEG (ADDshiftLL x x [log2(c-1)]))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(isPowerOfTwo(c-1) && c >= 3) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v0.AuxInt = log2(c - 1)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	return false
+}
+func rewriteValueARM64_OpARM64MNEG_10(v *Value) bool {
+	b := v.Block
+	_ = b
+	// match: (MNEG x (MOVDconst [c]))
+	// cond: isPowerOfTwo(c+1) && c >= 7
+	// result: (NEG (ADDshiftLL (NEG x) x [log2(c+1)]))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(isPowerOfTwo(c+1) && c >= 7) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v0.AuxInt = log2(c + 1)
+		v1 := b.NewValue0(v.Pos, OpARM64NEG, x.Type)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG (MOVDconst [c]) x)
+	// cond: isPowerOfTwo(c+1) && c >= 7
+	// result: (NEG (ADDshiftLL (NEG x) x [log2(c+1)]))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(isPowerOfTwo(c+1) && c >= 7) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v0.AuxInt = log2(c + 1)
+		v1 := b.NewValue0(v.Pos, OpARM64NEG, x.Type)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG x (MOVDconst [c]))
+	// cond: c%3 == 0 && isPowerOfTwo(c/3)
+	// result: (NEG (SLLconst [log2(c/3)] (ADDshiftLL x x [1])))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(c%3 == 0 && isPowerOfTwo(c/3)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 3)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 1
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG (MOVDconst [c]) x)
+	// cond: c%3 == 0 && isPowerOfTwo(c/3)
+	// result: (NEG (SLLconst [log2(c/3)] (ADDshiftLL x x [1])))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(c%3 == 0 && isPowerOfTwo(c/3)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 3)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 1
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG x (MOVDconst [c]))
+	// cond: c%5 == 0 && isPowerOfTwo(c/5)
+	// result: (NEG (SLLconst [log2(c/5)] (ADDshiftLL x x [2])))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(c%5 == 0 && isPowerOfTwo(c/5)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 5)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 2
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG (MOVDconst [c]) x)
+	// cond: c%5 == 0 && isPowerOfTwo(c/5)
+	// result: (NEG (SLLconst [log2(c/5)] (ADDshiftLL x x [2])))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(c%5 == 0 && isPowerOfTwo(c/5)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 5)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 2
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG x (MOVDconst [c]))
+	// cond: c%7 == 0 && isPowerOfTwo(c/7)
+	// result: (NEG (SLLconst [log2(c/7)] (ADDshiftLL (NEG x) x [3])))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(c%7 == 0 && isPowerOfTwo(c/7)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 7)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 3
+		v2 := b.NewValue0(v.Pos, OpARM64NEG, x.Type)
+		v2.AddArg(x)
+		v1.AddArg(v2)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG (MOVDconst [c]) x)
+	// cond: c%7 == 0 && isPowerOfTwo(c/7)
+	// result: (NEG (SLLconst [log2(c/7)] (ADDshiftLL (NEG x) x [3])))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(c%7 == 0 && isPowerOfTwo(c/7)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 7)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 3
+		v2 := b.NewValue0(v.Pos, OpARM64NEG, x.Type)
+		v2.AddArg(x)
+		v1.AddArg(v2)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG x (MOVDconst [c]))
+	// cond: c%9 == 0 && isPowerOfTwo(c/9)
+	// result: (NEG (SLLconst [log2(c/9)] (ADDshiftLL x x [3])))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(c%9 == 0 && isPowerOfTwo(c/9)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 9)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 3
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEG (MOVDconst [c]) x)
+	// cond: c%9 == 0 && isPowerOfTwo(c/9)
+	// result: (NEG (SLLconst [log2(c/9)] (ADDshiftLL x x [3])))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(c%9 == 0 && isPowerOfTwo(c/9)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 9)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 3
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	return false
+}
+func rewriteValueARM64_OpARM64MNEG_20(v *Value) bool {
+	// match: (MNEG (MOVDconst [c]) (MOVDconst [d]))
+	// cond:
+	// result: (MOVDconst [-c*d])
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		d := v_1.AuxInt
+		v.reset(OpARM64MOVDconst)
+		v.AuxInt = -c * d
+		return true
+	}
+	// match: (MNEG (MOVDconst [d]) (MOVDconst [c]))
+	// cond:
+	// result: (MOVDconst [-c*d])
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		d := v_0.AuxInt
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		v.reset(OpARM64MOVDconst)
+		v.AuxInt = -c * d
+		return true
+	}
+	return false
+}
+func rewriteValueARM64_OpARM64MNEGW_0(v *Value) bool {
+	b := v.Block
+	_ = b
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: int32(c)==-1
+	// result: x
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(int32(c) == -1) {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: int32(c)==-1
+	// result: x
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(int32(c) == -1) {
+			break
+		}
+		v.reset(OpCopy)
+		v.Type = x.Type
+		v.AddArg(x)
+		return true
+	}
+	// match: (MNEGW _ (MOVDconst [c]))
+	// cond: int32(c)==0
+	// result: (MOVDconst [0])
+	for {
+		_ = v.Args[1]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(int32(c) == 0) {
+			break
+		}
+		v.reset(OpARM64MOVDconst)
+		v.AuxInt = 0
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) _)
+	// cond: int32(c)==0
+	// result: (MOVDconst [0])
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		if !(int32(c) == 0) {
+			break
+		}
+		v.reset(OpARM64MOVDconst)
+		v.AuxInt = 0
+		return true
+	}
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: int32(c)==1
+	// result: (NEG x)
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(int32(c) == 1) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: int32(c)==1
+	// result: (NEG x)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(int32(c) == 1) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: isPowerOfTwo(c)
+	// result: (NEG (SLLconst [log2(c)] x))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(isPowerOfTwo(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: isPowerOfTwo(c)
+	// result: (NEG (SLLconst [log2(c)] x))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(isPowerOfTwo(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: isPowerOfTwo(c-1) && int32(c) >= 3
+	// result: (NEG (ADDshiftLL x x [log2(c-1)]))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(isPowerOfTwo(c-1) && int32(c) >= 3) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v0.AuxInt = log2(c - 1)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: isPowerOfTwo(c-1) && int32(c) >= 3
+	// result: (NEG (ADDshiftLL x x [log2(c-1)]))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(isPowerOfTwo(c-1) && int32(c) >= 3) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v0.AuxInt = log2(c - 1)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	return false
+}
+func rewriteValueARM64_OpARM64MNEGW_10(v *Value) bool {
+	b := v.Block
+	_ = b
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: isPowerOfTwo(c+1) && int32(c) >= 7
+	// result: (NEG (ADDshiftLL (NEG x) x [log2(c+1)]))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(isPowerOfTwo(c+1) && int32(c) >= 7) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v0.AuxInt = log2(c + 1)
+		v1 := b.NewValue0(v.Pos, OpARM64NEG, x.Type)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: isPowerOfTwo(c+1) && int32(c) >= 7
+	// result: (NEG (ADDshiftLL (NEG x) x [log2(c+1)]))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(isPowerOfTwo(c+1) && int32(c) >= 7) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v0.AuxInt = log2(c + 1)
+		v1 := b.NewValue0(v.Pos, OpARM64NEG, x.Type)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)
+	// result: (NEG (SLLconst [log2(c/3)] (ADDshiftLL x x [1])))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 3)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 1
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)
+	// result: (NEG (SLLconst [log2(c/3)] (ADDshiftLL x x [1])))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 3)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 1
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)
+	// result: (NEG (SLLconst [log2(c/5)] (ADDshiftLL x x [2])))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 5)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 2
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)
+	// result: (NEG (SLLconst [log2(c/5)] (ADDshiftLL x x [2])))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 5)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 2
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)
+	// result: (NEG (SLLconst [log2(c/7)] (ADDshiftLL (NEG x) x [3])))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 7)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 3
+		v2 := b.NewValue0(v.Pos, OpARM64NEG, x.Type)
+		v2.AddArg(x)
+		v1.AddArg(v2)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)
+	// result: (NEG (SLLconst [log2(c/7)] (ADDshiftLL (NEG x) x [3])))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 7)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 3
+		v2 := b.NewValue0(v.Pos, OpARM64NEG, x.Type)
+		v2.AddArg(x)
+		v1.AddArg(v2)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW x (MOVDconst [c]))
+	// cond: c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)
+	// result: (NEG (SLLconst [log2(c/9)] (ADDshiftLL x x [3])))
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		if !(c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 9)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 3
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MNEGW (MOVDconst [c]) x)
+	// cond: c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)
+	// result: (NEG (SLLconst [log2(c/9)] (ADDshiftLL x x [3])))
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		x := v.Args[1]
+		if !(c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)) {
+			break
+		}
+		v.reset(OpARM64NEG)
+		v0 := b.NewValue0(v.Pos, OpARM64SLLconst, x.Type)
+		v0.AuxInt = log2(c / 9)
+		v1 := b.NewValue0(v.Pos, OpARM64ADDshiftLL, x.Type)
+		v1.AuxInt = 3
+		v1.AddArg(x)
+		v1.AddArg(x)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+	return false
+}
+func rewriteValueARM64_OpARM64MNEGW_20(v *Value) bool {
+	// match: (MNEGW (MOVDconst [c]) (MOVDconst [d]))
+	// cond:
+	// result: (MOVDconst [-int64(int32(c)*int32(d))])
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_0.AuxInt
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		d := v_1.AuxInt
+		v.reset(OpARM64MOVDconst)
+		v.AuxInt = -int64(int32(c) * int32(d))
+		return true
+	}
+	// match: (MNEGW (MOVDconst [d]) (MOVDconst [c]))
+	// cond:
+	// result: (MOVDconst [-int64(int32(c)*int32(d))])
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MOVDconst {
+			break
+		}
+		d := v_0.AuxInt
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64MOVDconst {
+			break
+		}
+		c := v_1.AuxInt
+		v.reset(OpARM64MOVDconst)
+		v.AuxInt = -int64(int32(c) * int32(d))
+		return true
+	}
+	return false
+}
 func rewriteValueARM64_OpARM64MOD_0(v *Value) bool {
 	// match: (MOD (MOVDconst [c]) (MOVDconst [d]))
 	// cond:
@@ -6066,6 +7058,38 @@ func rewriteValueARM64_OpARM64MOVWstorezero_0(v *Value) bool {
 	return false
 }
 func rewriteValueARM64_OpARM64MUL_0(v *Value) bool {
+	// match: (MUL (NEG x) y)
+	// cond:
+	// result: (MNEG x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64NEG {
+			break
+		}
+		x := v_0.Args[0]
+		y := v.Args[1]
+		v.reset(OpARM64MNEG)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (MUL y (NEG x))
+	// cond:
+	// result: (MNEG x y)
+	for {
+		_ = v.Args[1]
+		y := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64NEG {
+			break
+		}
+		x := v_1.Args[0]
+		v.reset(OpARM64MNEG)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
 	// match: (MUL x (MOVDconst [-1]))
 	// cond:
 	// result: (NEG x)
@@ -6206,6 +7230,11 @@ func rewriteValueARM64_OpARM64MUL_0(v *Value) bool {
 		v.AddArg(x)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64MUL_10(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (MUL x (MOVDconst [c]))
 	// cond: isPowerOfTwo(c-1) && c >= 3
 	// result: (ADDshiftLL x x [log2(c-1)])
@@ -6246,11 +7275,6 @@ func rewriteValueARM64_OpARM64MUL_0(v *Value) bool {
 		v.AddArg(x)
 		return true
 	}
-	return false
-}
-func rewriteValueARM64_OpARM64MUL_10(v *Value) bool {
-	b := v.Block
-	_ = b
 	// match: (MUL x (MOVDconst [c]))
 	// cond: isPowerOfTwo(c+1) && c >= 7
 	// result: (ADDshiftLL (NEG x) x [log2(c+1)])
@@ -6437,6 +7461,11 @@ func rewriteValueARM64_OpARM64MUL_10(v *Value) bool {
 		v.AddArg(v0)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64MUL_20(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (MUL x (MOVDconst [c]))
 	// cond: c%9 == 0 && isPowerOfTwo(c/9)
 	// result: (SLLconst [log2(c/9)] (ADDshiftLL x x [3]))
@@ -6483,9 +7512,6 @@ func rewriteValueARM64_OpARM64MUL_10(v *Value) bool {
 		v.AddArg(v0)
 		return true
 	}
-	return false
-}
-func rewriteValueARM64_OpARM64MUL_20(v *Value) bool {
 	// match: (MUL (MOVDconst [c]) (MOVDconst [d]))
 	// cond:
 	// result: (MOVDconst [c*d])
@@ -6527,6 +7553,38 @@ func rewriteValueARM64_OpARM64MUL_20(v *Value) bool {
 	return false
 }
 func rewriteValueARM64_OpARM64MULW_0(v *Value) bool {
+	// match: (MULW (NEG x) y)
+	// cond:
+	// result: (MNEGW x y)
+	for {
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64NEG {
+			break
+		}
+		x := v_0.Args[0]
+		y := v.Args[1]
+		v.reset(OpARM64MNEGW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (MULW y (NEG x))
+	// cond:
+	// result: (MNEGW x y)
+	for {
+		_ = v.Args[1]
+		y := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpARM64NEG {
+			break
+		}
+		x := v_1.Args[0]
+		v.reset(OpARM64MNEGW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
 	// match: (MULW x (MOVDconst [c]))
 	// cond: int32(c)==-1
 	// result: (NEG x)
@@ -6673,6 +7731,11 @@ func rewriteValueARM64_OpARM64MULW_0(v *Value) bool {
 		v.AddArg(x)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64MULW_10(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (MULW x (MOVDconst [c]))
 	// cond: isPowerOfTwo(c-1) && int32(c) >= 3
 	// result: (ADDshiftLL x x [log2(c-1)])
@@ -6713,11 +7776,6 @@ func rewriteValueARM64_OpARM64MULW_0(v *Value) bool {
 		v.AddArg(x)
 		return true
 	}
-	return false
-}
-func rewriteValueARM64_OpARM64MULW_10(v *Value) bool {
-	b := v.Block
-	_ = b
 	// match: (MULW x (MOVDconst [c]))
 	// cond: isPowerOfTwo(c+1) && int32(c) >= 7
 	// result: (ADDshiftLL (NEG x) x [log2(c+1)])
@@ -6904,6 +7962,11 @@ func rewriteValueARM64_OpARM64MULW_10(v *Value) bool {
 		v.AddArg(v0)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64MULW_20(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (MULW x (MOVDconst [c]))
 	// cond: c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c)
 	// result: (SLLconst [log2(c/9)] (ADDshiftLL x x [3]))
@@ -6950,9 +8013,6 @@ func rewriteValueARM64_OpARM64MULW_10(v *Value) bool {
 		v.AddArg(v0)
 		return true
 	}
-	return false
-}
-func rewriteValueARM64_OpARM64MULW_20(v *Value) bool {
 	// match: (MULW (MOVDconst [c]) (MOVDconst [d]))
 	// cond:
 	// result: (MOVDconst [int64(int32(c)*int32(d))])
@@ -7010,6 +8070,38 @@ func rewriteValueARM64_OpARM64MVN_0(v *Value) bool {
 	return false
 }
 func rewriteValueARM64_OpARM64NEG_0(v *Value) bool {
+	// match: (NEG (MUL x y))
+	// cond:
+	// result: (MNEG x y)
+	for {
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MUL {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		y := v_0.Args[1]
+		v.reset(OpARM64MNEG)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+	// match: (NEG (MULW x y))
+	// cond:
+	// result: (MNEGW x y)
+	for {
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64MULW {
+			break
+		}
+		_ = v_0.Args[1]
+		x := v_0.Args[0]
+		y := v_0.Args[1]
+		v.reset(OpARM64MNEGW)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
 	// match: (NEG (MOVDconst [c]))
 	// cond:
 	// result: (MOVDconst [-c])