From c53d1236dfaa900387519810c8de1a9353c55bab Mon Sep 17 00:00:00 2001 From: Alberto Donizetti Date: Fri, 24 Apr 2020 17:24:16 +0200 Subject: [PATCH] cmd/compile: use typed aux for first half of arm64 lowering Passes GOARCH=arm64 gotip build -toolexec 'toolstash -cmp' -a std Change-Id: Icb530d8d128d9938ab44a9c716c8dd09a34ededf Reviewed-on: https://go-review.googlesource.com/c/go/+/229937 Reviewed-by: Keith Randall --- src/cmd/compile/internal/ssa/gen/ARM64.rules | 374 +++++++++---------- src/cmd/compile/internal/ssa/rewriteARM64.go | 324 +++++++++++----- 2 files changed, 417 insertions(+), 281 deletions(-) diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index 8478c1c678..9fec8c5526 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -2,130 +2,130 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -(Add(Ptr|64|32|16|8) ...) -> (ADD ...) -(Add(32F|64F) ...) -> (FADD(S|D) ...) - -(Sub(Ptr|64|32|16|8) ...) -> (SUB ...) -(Sub(32F|64F) ...) -> (FSUB(S|D) ...) - -(Mul64 ...) -> (MUL ...) -(Mul(32|16|8) ...) -> (MULW ...) -(Mul(32F|64F) ...) -> (FMUL(S|D) ...) - -(Hmul64 ...) -> (MULH ...) -(Hmul64u ...) -> (UMULH ...) -(Hmul32 x y) -> (SRAconst (MULL x y) [32]) -(Hmul32u x y) -> (SRAconst (UMULL x y) [32]) -(Mul64uhilo ...) -> (LoweredMuluhilo ...) - -(Div64 ...) -> (DIV ...) -(Div64u ...) -> (UDIV ...) -(Div32 ...) -> (DIVW ...) -(Div32u ...) -> (UDIVW ...) -(Div16 x y) -> (DIVW (SignExt16to32 x) (SignExt16to32 y)) -(Div16u x y) -> (UDIVW (ZeroExt16to32 x) (ZeroExt16to32 y)) -(Div8 x y) -> (DIVW (SignExt8to32 x) (SignExt8to32 y)) -(Div8u x y) -> (UDIVW (ZeroExt8to32 x) (ZeroExt8to32 y)) -(Div32F ...) -> (FDIVS ...) -(Div64F ...) -> (FDIVD ...) - -(Mod64 ...) -> (MOD ...) -(Mod64u ...) -> (UMOD ...) -(Mod32 ...) -> (MODW ...) -(Mod32u ...) -> (UMODW ...) -(Mod16 x y) -> (MODW (SignExt16to32 x) (SignExt16to32 y)) -(Mod16u x y) -> (UMODW (ZeroExt16to32 x) (ZeroExt16to32 y)) -(Mod8 x y) -> (MODW (SignExt8to32 x) (SignExt8to32 y)) -(Mod8u x y) -> (UMODW (ZeroExt8to32 x) (ZeroExt8to32 y)) - -// (x + y) / 2 with x>=y -> (x - y) / 2 + y -(Avg64u x y) -> (ADD (SRLconst (SUB x y) [1]) y) - -(And(64|32|16|8) ...) -> (AND ...) -(Or(64|32|16|8) ...) -> (OR ...) -(Xor(64|32|16|8) ...) -> (XOR ...) +(Add(Ptr|64|32|16|8) ...) => (ADD ...) +(Add(32F|64F) ...) => (FADD(S|D) ...) + +(Sub(Ptr|64|32|16|8) ...) => (SUB ...) +(Sub(32F|64F) ...) => (FSUB(S|D) ...) + +(Mul64 ...) => (MUL ...) +(Mul(32|16|8) ...) => (MULW ...) +(Mul(32F|64F) ...) => (FMUL(S|D) ...) + +(Hmul64 ...) => (MULH ...) +(Hmul64u ...) => (UMULH ...) +(Hmul32 x y) => (SRAconst (MULL x y) [32]) +(Hmul32u x y) => (SRAconst (UMULL x y) [32]) +(Mul64uhilo ...) => (LoweredMuluhilo ...) + +(Div64 [false] x y) => (DIV x y) +(Div64u ...) => (UDIV ...) +(Div32 [false] x y) => (DIVW x y) +(Div32u ...) => (UDIVW ...) +(Div16 [false] x y) => (DIVW (SignExt16to32 x) (SignExt16to32 y)) +(Div16u x y) => (UDIVW (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Div8 x y) => (DIVW (SignExt8to32 x) (SignExt8to32 y)) +(Div8u x y) => (UDIVW (ZeroExt8to32 x) (ZeroExt8to32 y)) +(Div32F ...) => (FDIVS ...) +(Div64F ...) => (FDIVD ...) + +(Mod64 x y) => (MOD x y) +(Mod64u ...) => (UMOD ...) +(Mod32 x y) => (MODW x y) +(Mod32u ...) => (UMODW ...) 
+(Mod16 x y) => (MODW (SignExt16to32 x) (SignExt16to32 y)) +(Mod16u x y) => (UMODW (ZeroExt16to32 x) (ZeroExt16to32 y)) +(Mod8 x y) => (MODW (SignExt8to32 x) (SignExt8to32 y)) +(Mod8u x y) => (UMODW (ZeroExt8to32 x) (ZeroExt8to32 y)) + +// (x + y) / 2 with x>=y => (x - y) / 2 + y +(Avg64u x y) => (ADD (SRLconst (SUB x y) [1]) y) + +(And(64|32|16|8) ...) => (AND ...) +(Or(64|32|16|8) ...) => (OR ...) +(Xor(64|32|16|8) ...) => (XOR ...) // unary ops -(Neg(64|32|16|8) ...) -> (NEG ...) -(Neg(32F|64F) ...) -> (FNEG(S|D) ...) -(Com(64|32|16|8) ...) -> (MVN ...) +(Neg(64|32|16|8) ...) => (NEG ...) +(Neg(32F|64F) ...) => (FNEG(S|D) ...) +(Com(64|32|16|8) ...) => (MVN ...) // math package intrinsics -(Abs ...) -> (FABSD ...) -(Sqrt ...) -> (FSQRTD ...) -(Ceil ...) -> (FRINTPD ...) -(Floor ...) -> (FRINTMD ...) -(Round ...) -> (FRINTAD ...) -(RoundToEven ...) -> (FRINTND ...) -(Trunc ...) -> (FRINTZD ...) -(FMA x y z) -> (FMADDD z x y) +(Abs ...) => (FABSD ...) +(Sqrt ...) => (FSQRTD ...) +(Ceil ...) => (FRINTPD ...) +(Floor ...) => (FRINTMD ...) +(Round ...) => (FRINTAD ...) +(RoundToEven ...) => (FRINTND ...) +(Trunc ...) => (FRINTZD ...) +(FMA x y z) => (FMADDD z x y) // lowering rotates -(RotateLeft8 x (MOVDconst [c])) -> (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) -(RotateLeft16 x (MOVDconst [c])) -> (Or16 (Lsh16x64 x (MOVDconst [c&15])) (Rsh16Ux64 x (MOVDconst [-c&15]))) -(RotateLeft32 x y) -> (RORW x (NEG y)) -(RotateLeft64 x y) -> (ROR x (NEG y)) +(RotateLeft8 x (MOVDconst [c])) => (Or8 (Lsh8x64 x (MOVDconst [c&7])) (Rsh8Ux64 x (MOVDconst [-c&7]))) +(RotateLeft16 x (MOVDconst [c])) => (Or16 (Lsh16x64 x (MOVDconst [c&15])) (Rsh16Ux64 x (MOVDconst [-c&15]))) +(RotateLeft32 x y) => (RORW x (NEG y)) +(RotateLeft64 x y) => (ROR x (NEG y)) -(Ctz(64|32|16|8)NonZero ...) -> (Ctz(64|32|32|32) ...) +(Ctz(64|32|16|8)NonZero ...) => (Ctz(64|32|32|32) ...) -(Ctz64 x) -> (CLZ (RBIT x)) -(Ctz32 x) -> (CLZW (RBITW x)) -(Ctz16 x) -> (CLZW (RBITW (ORconst [0x10000] x))) -(Ctz8 x) -> (CLZW (RBITW (ORconst [0x100] x))) +(Ctz64 x) => (CLZ (RBIT x)) +(Ctz32 x) => (CLZW (RBITW x)) +(Ctz16 x) => (CLZW (RBITW (ORconst [0x10000] x))) +(Ctz8 x) => (CLZW (RBITW (ORconst [0x100] x))) -(PopCount64 x) -> (FMOVDfpgp (VUADDLV (VCNT (FMOVDgpfp x)))) -(PopCount32 x) -> (FMOVDfpgp (VUADDLV (VCNT (FMOVDgpfp (ZeroExt32to64 x))))) -(PopCount16 x) -> (FMOVDfpgp (VUADDLV (VCNT (FMOVDgpfp (ZeroExt16to64 x))))) +(PopCount64 x) => (FMOVDfpgp (VUADDLV (VCNT (FMOVDgpfp x)))) +(PopCount32 x) => (FMOVDfpgp (VUADDLV (VCNT (FMOVDgpfp (ZeroExt32to64 x))))) +(PopCount16 x) => (FMOVDfpgp (VUADDLV (VCNT (FMOVDgpfp (ZeroExt16to64 x))))) // Load args directly into the register class where it will be used. -(FMOVDgpfp (Arg [off] {sym})) -> @b.Func.Entry (Arg [off] {sym}) -(FMOVDfpgp (Arg [off] {sym})) -> @b.Func.Entry (Arg [off] {sym}) +(FMOVDgpfp (Arg [off] {sym})) => @b.Func.Entry (Arg [off] {sym}) +(FMOVDfpgp (Arg [off] {sym})) => @b.Func.Entry (Arg [off] {sym}) -// Similarly for stores, if we see a store after FPR <-> GPR move, then redirect store to use the other register set. 
-(MOVDstore [off] {sym} ptr (FMOVDfpgp val) mem) -> (FMOVDstore [off] {sym} ptr val mem) -(FMOVDstore [off] {sym} ptr (FMOVDgpfp val) mem) -> (MOVDstore [off] {sym} ptr val mem) -(MOVWstore [off] {sym} ptr (FMOVSfpgp val) mem) -> (FMOVSstore [off] {sym} ptr val mem) -(FMOVSstore [off] {sym} ptr (FMOVSgpfp val) mem) -> (MOVWstore [off] {sym} ptr val mem) +// Similarly for stores, if we see a store after FPR <=> GPR move, then redirect store to use the other register set. +(MOVDstore [off] {sym} ptr (FMOVDfpgp val) mem) => (FMOVDstore [off] {sym} ptr val mem) +(FMOVDstore [off] {sym} ptr (FMOVDgpfp val) mem) => (MOVDstore [off] {sym} ptr val mem) +(MOVWstore [off] {sym} ptr (FMOVSfpgp val) mem) => (FMOVSstore [off] {sym} ptr val mem) +(FMOVSstore [off] {sym} ptr (FMOVSgpfp val) mem) => (MOVWstore [off] {sym} ptr val mem) -// float <-> int register moves, with no conversion. +// float <=> int register moves, with no conversion. // These come up when compiling math.{Float64bits, Float64frombits, Float32bits, Float32frombits}. -(MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr val _)) -> (FMOVDfpgp val) -(FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr val _)) -> (FMOVDgpfp val) -(MOVWUload [off] {sym} ptr (FMOVSstore [off] {sym} ptr val _)) -> (FMOVSfpgp val) -(FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) -> (FMOVSgpfp val) +(MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr val _)) => (FMOVDfpgp val) +(FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr val _)) => (FMOVDgpfp val) +(MOVWUload [off] {sym} ptr (FMOVSstore [off] {sym} ptr val _)) => (FMOVSfpgp val) +(FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) => (FMOVSgpfp val) -(BitLen64 x) -> (SUB (MOVDconst [64]) (CLZ x)) -(BitLen32 x) -> (SUB (MOVDconst [32]) (CLZW x)) +(BitLen64 x) => (SUB (MOVDconst [64]) (CLZ x)) +(BitLen32 x) => (SUB (MOVDconst [32]) (CLZW x)) -(Bswap64 ...) -> (REV ...) -(Bswap32 ...) -> (REVW ...) +(Bswap64 ...) => (REV ...) +(Bswap32 ...) => (REVW ...) -(BitRev64 ...) -> (RBIT ...) -(BitRev32 ...) -> (RBITW ...) -(BitRev16 x) -> (SRLconst [48] (RBIT x)) -(BitRev8 x) -> (SRLconst [56] (RBIT x)) +(BitRev64 ...) => (RBIT ...) +(BitRev32 ...) => (RBITW ...) +(BitRev16 x) => (SRLconst [48] (RBIT x)) +(BitRev8 x) => (SRLconst [56] (RBIT x)) // In fact, UMOD will be translated into UREM instruction, and UREM is originally translated into // UDIV and MSUB instructions. But if there is already an identical UDIV instruction just before or // after UREM (case like quo, rem := z/y, z%y), then the second UDIV instruction becomes redundant. // The purpose of this rule is to have this extra UDIV instruction removed in CSE pass. -(UMOD x y) -> (MSUB x y (UDIV x y)) -(UMODW x y) -> (MSUBW x y (UDIVW x y)) +(UMOD x y) => (MSUB x y (UDIV x y)) +(UMODW x y) => (MSUBW x y (UDIVW x y)) // 64-bit addition with carry. -(Select0 (Add64carry x y c)) -> (Select0 (ADCSflags x y (Select1 (ADDSconstflags [-1] c)))) -(Select1 (Add64carry x y c)) -> (ADCzerocarry (Select1 (ADCSflags x y (Select1 (ADDSconstflags [-1] c))))) +(Select0 (Add64carry x y c)) => (Select0 (ADCSflags x y (Select1 (ADDSconstflags [-1] c)))) +(Select1 (Add64carry x y c)) => (ADCzerocarry (Select1 (ADCSflags x y (Select1 (ADDSconstflags [-1] c))))) // 64-bit subtraction with borrowing. 
-(Select0 (Sub64borrow x y bo)) -> (Select0 (SBCSflags x y (Select1 (NEGSflags bo)))) -(Select1 (Sub64borrow x y bo)) -> (NEG (NGCzerocarry (Select1 (SBCSflags x y (Select1 (NEGSflags bo)))))) +(Select0 (Sub64borrow x y bo)) => (Select0 (SBCSflags x y (Select1 (NEGSflags bo)))) +(Select1 (Sub64borrow x y bo)) => (NEG (NGCzerocarry (Select1 (SBCSflags x y (Select1 (NEGSflags bo)))))) // boolean ops -- booleans are represented with 0=false, 1=true -(AndB ...) -> (AND ...) -(OrB ...) -> (OR ...) -(EqB x y) -> (XOR (MOVDconst [1]) (XOR x y)) -(NeqB ...) -> (XOR ...) -(Not x) -> (XOR (MOVDconst [1]) x) +(AndB ...) => (AND ...) +(OrB ...) => (OR ...) +(EqB x y) => (XOR (MOVDconst [1]) (XOR x y)) +(NeqB ...) => (XOR ...) +(Not x) => (XOR (MOVDconst [1]) x) // shifts // hardware instruction uses only the low 6 bits of the shift @@ -193,126 +193,126 @@ (Rsh8x8 x y) -> (SRA (SignExt8to64 x) (CSEL {OpARM64LessThanU} (ZeroExt8to64 y) (Const64 [63]) (CMPconst [64] (ZeroExt8to64 y)))) // constants -(Const(64|32|16|8) ...) -> (MOVDconst ...) -(Const(32F|64F) ...) -> (FMOV(S|D)const ...) -(ConstNil) -> (MOVDconst [0]) -(ConstBool ...) -> (MOVDconst ...) +(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)]) +(Const(32F|64F) [val]) => (FMOV(S|D)const [float64(val)]) +(ConstNil) => (MOVDconst [0]) +(ConstBool [b]) => (MOVDconst [b2i(b)]) -(Slicemask x) -> (SRAconst (NEG x) [63]) +(Slicemask x) => (SRAconst (NEG x) [63]) // truncations // Because we ignore high parts of registers, truncates are just copies. -(Trunc16to8 ...) -> (Copy ...) -(Trunc32to8 ...) -> (Copy ...) -(Trunc32to16 ...) -> (Copy ...) -(Trunc64to8 ...) -> (Copy ...) -(Trunc64to16 ...) -> (Copy ...) -(Trunc64to32 ...) -> (Copy ...) +(Trunc16to8 ...) => (Copy ...) +(Trunc32to8 ...) => (Copy ...) +(Trunc32to16 ...) => (Copy ...) +(Trunc64to8 ...) => (Copy ...) +(Trunc64to16 ...) => (Copy ...) +(Trunc64to32 ...) => (Copy ...) // Zero-/Sign-extensions -(ZeroExt8to16 ...) -> (MOVBUreg ...) -(ZeroExt8to32 ...) -> (MOVBUreg ...) -(ZeroExt16to32 ...) -> (MOVHUreg ...) -(ZeroExt8to64 ...) -> (MOVBUreg ...) -(ZeroExt16to64 ...) -> (MOVHUreg ...) -(ZeroExt32to64 ...) -> (MOVWUreg ...) - -(SignExt8to16 ...) -> (MOVBreg ...) -(SignExt8to32 ...) -> (MOVBreg ...) -(SignExt16to32 ...) -> (MOVHreg ...) -(SignExt8to64 ...) -> (MOVBreg ...) -(SignExt16to64 ...) -> (MOVHreg ...) -(SignExt32to64 ...) -> (MOVWreg ...) +(ZeroExt8to16 ...) => (MOVBUreg ...) +(ZeroExt8to32 ...) => (MOVBUreg ...) +(ZeroExt16to32 ...) => (MOVHUreg ...) +(ZeroExt8to64 ...) => (MOVBUreg ...) +(ZeroExt16to64 ...) => (MOVHUreg ...) +(ZeroExt32to64 ...) => (MOVWUreg ...) + +(SignExt8to16 ...) => (MOVBreg ...) +(SignExt8to32 ...) => (MOVBreg ...) +(SignExt16to32 ...) => (MOVHreg ...) +(SignExt8to64 ...) => (MOVBreg ...) +(SignExt16to64 ...) => (MOVHreg ...) +(SignExt32to64 ...) => (MOVWreg ...) // float <-> int conversion -(Cvt32to32F ...) -> (SCVTFWS ...) -(Cvt32to64F ...) -> (SCVTFWD ...) -(Cvt64to32F ...) -> (SCVTFS ...) -(Cvt64to64F ...) -> (SCVTFD ...) -(Cvt32Uto32F ...) -> (UCVTFWS ...) -(Cvt32Uto64F ...) -> (UCVTFWD ...) -(Cvt64Uto32F ...) -> (UCVTFS ...) -(Cvt64Uto64F ...) -> (UCVTFD ...) -(Cvt32Fto32 ...) -> (FCVTZSSW ...) -(Cvt64Fto32 ...) -> (FCVTZSDW ...) -(Cvt32Fto64 ...) -> (FCVTZSS ...) -(Cvt64Fto64 ...) -> (FCVTZSD ...) -(Cvt32Fto32U ...) -> (FCVTZUSW ...) -(Cvt64Fto32U ...) -> (FCVTZUDW ...) -(Cvt32Fto64U ...) -> (FCVTZUS ...) -(Cvt64Fto64U ...) -> (FCVTZUD ...) -(Cvt32Fto64F ...) -> (FCVTSD ...) -(Cvt64Fto32F ...) -> (FCVTDS ...) - -(CvtBoolToUint8 ...) -> (Copy ...) 
- -(Round32F ...) -> (LoweredRound32F ...) -(Round64F ...) -> (LoweredRound64F ...) +(Cvt32to32F ...) => (SCVTFWS ...) +(Cvt32to64F ...) => (SCVTFWD ...) +(Cvt64to32F ...) => (SCVTFS ...) +(Cvt64to64F ...) => (SCVTFD ...) +(Cvt32Uto32F ...) => (UCVTFWS ...) +(Cvt32Uto64F ...) => (UCVTFWD ...) +(Cvt64Uto32F ...) => (UCVTFS ...) +(Cvt64Uto64F ...) => (UCVTFD ...) +(Cvt32Fto32 ...) => (FCVTZSSW ...) +(Cvt64Fto32 ...) => (FCVTZSDW ...) +(Cvt32Fto64 ...) => (FCVTZSS ...) +(Cvt64Fto64 ...) => (FCVTZSD ...) +(Cvt32Fto32U ...) => (FCVTZUSW ...) +(Cvt64Fto32U ...) => (FCVTZUDW ...) +(Cvt32Fto64U ...) => (FCVTZUS ...) +(Cvt64Fto64U ...) => (FCVTZUD ...) +(Cvt32Fto64F ...) => (FCVTSD ...) +(Cvt64Fto32F ...) => (FCVTDS ...) + +(CvtBoolToUint8 ...) => (Copy ...) + +(Round32F ...) => (LoweredRound32F ...) +(Round64F ...) => (LoweredRound64F ...) // comparisons -(Eq8 x y) -> (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) -(Eq16 x y) -> (Equal (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) -(Eq32 x y) -> (Equal (CMPW x y)) -(Eq64 x y) -> (Equal (CMP x y)) -(EqPtr x y) -> (Equal (CMP x y)) -(Eq32F x y) -> (Equal (FCMPS x y)) -(Eq64F x y) -> (Equal (FCMPD x y)) - -(Neq8 x y) -> (NotEqual (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) -(Neq16 x y) -> (NotEqual (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) -(Neq32 x y) -> (NotEqual (CMPW x y)) -(Neq64 x y) -> (NotEqual (CMP x y)) -(NeqPtr x y) -> (NotEqual (CMP x y)) -(Neq32F x y) -> (NotEqual (FCMPS x y)) -(Neq64F x y) -> (NotEqual (FCMPD x y)) - -(Less8 x y) -> (LessThan (CMPW (SignExt8to32 x) (SignExt8to32 y))) -(Less16 x y) -> (LessThan (CMPW (SignExt16to32 x) (SignExt16to32 y))) -(Less32 x y) -> (LessThan (CMPW x y)) -(Less64 x y) -> (LessThan (CMP x y)) +(Eq8 x y) => (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Eq16 x y) => (Equal (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Eq32 x y) => (Equal (CMPW x y)) +(Eq64 x y) => (Equal (CMP x y)) +(EqPtr x y) => (Equal (CMP x y)) +(Eq32F x y) => (Equal (FCMPS x y)) +(Eq64F x y) => (Equal (FCMPD x y)) + +(Neq8 x y) => (NotEqual (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Neq16 x y) => (NotEqual (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Neq32 x y) => (NotEqual (CMPW x y)) +(Neq64 x y) => (NotEqual (CMP x y)) +(NeqPtr x y) => (NotEqual (CMP x y)) +(Neq32F x y) => (NotEqual (FCMPS x y)) +(Neq64F x y) => (NotEqual (FCMPD x y)) + +(Less8 x y) => (LessThan (CMPW (SignExt8to32 x) (SignExt8to32 y))) +(Less16 x y) => (LessThan (CMPW (SignExt16to32 x) (SignExt16to32 y))) +(Less32 x y) => (LessThan (CMPW x y)) +(Less64 x y) => (LessThan (CMP x y)) // Set condition flags for floating-point comparisons "x < y" // and "x <= y". Because if either or both of the operands are // NaNs, all three of (x < y), (x == y) and (x > y) are false, // and ARM Manual says FCMP instruction sets PSTATE. // of this case to (0, 0, 1, 1). 
-(Less32F x y) -> (LessThanF (FCMPS x y)) -(Less64F x y) -> (LessThanF (FCMPD x y)) +(Less32F x y) => (LessThanF (FCMPS x y)) +(Less64F x y) => (LessThanF (FCMPD x y)) -(Less8U x y) -> (LessThanU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) -(Less16U x y) -> (LessThanU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) -(Less32U x y) -> (LessThanU (CMPW x y)) -(Less64U x y) -> (LessThanU (CMP x y)) +(Less8U x y) => (LessThanU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Less16U x y) => (LessThanU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Less32U x y) => (LessThanU (CMPW x y)) +(Less64U x y) => (LessThanU (CMP x y)) -(Leq8 x y) -> (LessEqual (CMPW (SignExt8to32 x) (SignExt8to32 y))) -(Leq16 x y) -> (LessEqual (CMPW (SignExt16to32 x) (SignExt16to32 y))) -(Leq32 x y) -> (LessEqual (CMPW x y)) -(Leq64 x y) -> (LessEqual (CMP x y)) +(Leq8 x y) => (LessEqual (CMPW (SignExt8to32 x) (SignExt8to32 y))) +(Leq16 x y) => (LessEqual (CMPW (SignExt16to32 x) (SignExt16to32 y))) +(Leq32 x y) => (LessEqual (CMPW x y)) +(Leq64 x y) => (LessEqual (CMP x y)) // Refer to the comments for op Less64F above. -(Leq32F x y) -> (LessEqualF (FCMPS x y)) -(Leq64F x y) -> (LessEqualF (FCMPD x y)) +(Leq32F x y) => (LessEqualF (FCMPS x y)) +(Leq64F x y) => (LessEqualF (FCMPD x y)) -(Leq8U x y) -> (LessEqualU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) -(Leq16U x y) -> (LessEqualU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) -(Leq32U x y) -> (LessEqualU (CMPW x y)) -(Leq64U x y) -> (LessEqualU (CMP x y)) +(Leq8U x y) => (LessEqualU (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) +(Leq16U x y) => (LessEqualU (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) +(Leq32U x y) => (LessEqualU (CMPW x y)) +(Leq64U x y) => (LessEqualU (CMP x y)) // Optimize comparison between a floating-point value and 0.0 with "FCMP $(0.0), Fn" -(FCMPS x (FMOVSconst [0])) -> (FCMPS0 x) -(FCMPS (FMOVSconst [0]) x) -> (InvertFlags (FCMPS0 x)) -(FCMPD x (FMOVDconst [0])) -> (FCMPD0 x) -(FCMPD (FMOVDconst [0]) x) -> (InvertFlags (FCMPD0 x)) +(FCMPS x (FMOVSconst [0])) => (FCMPS0 x) +(FCMPS (FMOVSconst [0]) x) => (InvertFlags (FCMPS0 x)) +(FCMPD x (FMOVDconst [0])) => (FCMPD0 x) +(FCMPD (FMOVDconst [0]) x) => (InvertFlags (FCMPD0 x)) // CSEL needs a flag-generating argument. Synthesize a CMPW if necessary. (CondSelect x y boolval) && flagArg(boolval) != nil -> (CSEL {boolval.Op} x y flagArg(boolval)) (CondSelect x y boolval) && flagArg(boolval) == nil -> (CSEL {OpARM64NotEqual} x y (CMPWconst [0] boolval)) -(OffPtr [off] ptr:(SP)) -> (MOVDaddr [off] ptr) -(OffPtr [off] ptr) -> (ADDconst [off] ptr) +(OffPtr [off] ptr:(SP)) && is32Bit(off) => (MOVDaddr [int32(off)] ptr) +(OffPtr [off] ptr) => (ADDconst [off] ptr) -(Addr ...) -> (MOVDaddr ...) 
-(LocalAddr {sym} base _) -> (MOVDaddr {sym} base) +(Addr {sym} base) => (MOVDaddr {sym} base) +(LocalAddr {sym} base _) => (MOVDaddr {sym} base) // loads (Load ptr mem) && t.IsBoolean() -> (MOVBUload ptr mem) diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index fd42ec8e21..2c18a70581 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -396,8 +396,7 @@ func rewriteValueARM64(v *Value) bool { v.Op = OpARM64ADD return true case OpAddr: - v.Op = OpARM64MOVDaddr - return true + return rewriteValueARM64_OpAddr(v) case OpAnd16: v.Op = OpARM64AND return true @@ -508,26 +507,19 @@ func rewriteValueARM64(v *Value) bool { case OpCondSelect: return rewriteValueARM64_OpCondSelect(v) case OpConst16: - v.Op = OpARM64MOVDconst - return true + return rewriteValueARM64_OpConst16(v) case OpConst32: - v.Op = OpARM64MOVDconst - return true + return rewriteValueARM64_OpConst32(v) case OpConst32F: - v.Op = OpARM64FMOVSconst - return true + return rewriteValueARM64_OpConst32F(v) case OpConst64: - v.Op = OpARM64MOVDconst - return true + return rewriteValueARM64_OpConst64(v) case OpConst64F: - v.Op = OpARM64FMOVDconst - return true + return rewriteValueARM64_OpConst64F(v) case OpConst8: - v.Op = OpARM64MOVDconst - return true + return rewriteValueARM64_OpConst8(v) case OpConstBool: - v.Op = OpARM64MOVDconst - return true + return rewriteValueARM64_OpConstBool(v) case OpConstNil: return rewriteValueARM64_OpConstNil(v) case OpCtz16: @@ -612,8 +604,7 @@ func rewriteValueARM64(v *Value) bool { case OpDiv16u: return rewriteValueARM64_OpDiv16u(v) case OpDiv32: - v.Op = OpARM64DIVW - return true + return rewriteValueARM64_OpDiv32(v) case OpDiv32F: v.Op = OpARM64FDIVS return true @@ -621,8 +612,7 @@ func rewriteValueARM64(v *Value) bool { v.Op = OpARM64UDIVW return true case OpDiv64: - v.Op = OpARM64DIV - return true + return rewriteValueARM64_OpDiv64(v) case OpDiv64F: v.Op = OpARM64FDIVD return true @@ -763,14 +753,12 @@ func rewriteValueARM64(v *Value) bool { case OpMod16u: return rewriteValueARM64_OpMod16u(v) case OpMod32: - v.Op = OpARM64MODW - return true + return rewriteValueARM64_OpMod32(v) case OpMod32u: v.Op = OpARM64UMODW return true case OpMod64: - v.Op = OpARM64MOD - return true + return rewriteValueARM64_OpMod64(v) case OpMod64u: v.Op = OpARM64UMOD return true @@ -4002,7 +3990,7 @@ func rewriteValueARM64_OpARM64FCMPD(v *Value) bool { // result: (FCMPD0 x) for { x := v_0 - if v_1.Op != OpARM64FMOVDconst || v_1.AuxInt != 0 { + if v_1.Op != OpARM64FMOVDconst || auxIntToFloat64(v_1.AuxInt) != 0 { break } v.reset(OpARM64FCMPD0) @@ -4012,7 +4000,7 @@ func rewriteValueARM64_OpARM64FCMPD(v *Value) bool { // match: (FCMPD (FMOVDconst [0]) x) // result: (InvertFlags (FCMPD0 x)) for { - if v_0.Op != OpARM64FMOVDconst || v_0.AuxInt != 0 { + if v_0.Op != OpARM64FMOVDconst || auxIntToFloat64(v_0.AuxInt) != 0 { break } x := v_1 @@ -4032,7 +4020,7 @@ func rewriteValueARM64_OpARM64FCMPS(v *Value) bool { // result: (FCMPS0 x) for { x := v_0 - if v_1.Op != OpARM64FMOVSconst || v_1.AuxInt != 0 { + if v_1.Op != OpARM64FMOVSconst || auxIntToFloat64(v_1.AuxInt) != 0 { break } v.reset(OpARM64FCMPS0) @@ -4042,7 +4030,7 @@ func rewriteValueARM64_OpARM64FCMPS(v *Value) bool { // match: (FCMPS (FMOVSconst [0]) x) // result: (InvertFlags (FCMPS0 x)) for { - if v_0.Op != OpARM64FMOVSconst || v_0.AuxInt != 0 { + if v_0.Op != OpARM64FMOVSconst || auxIntToFloat64(v_0.AuxInt) != 0 { break } x := v_1 @@ -4064,13 +4052,13 @@ 
func rewriteValueARM64_OpARM64FMOVDfpgp(v *Value) bool { if v_0.Op != OpArg { break } - off := v_0.AuxInt - sym := v_0.Aux + off := auxIntToInt32(v_0.AuxInt) + sym := auxToSym(v_0.Aux) b = b.Func.Entry v0 := b.NewValue0(v.Pos, OpArg, t) v.copyOf(v0) - v0.AuxInt = off - v0.Aux = sym + v0.AuxInt = int32ToAuxInt(off) + v0.Aux = symToAux(sym) return true } return false @@ -4085,13 +4073,13 @@ func rewriteValueARM64_OpARM64FMOVDgpfp(v *Value) bool { if v_0.Op != OpArg { break } - off := v_0.AuxInt - sym := v_0.Aux + off := auxIntToInt32(v_0.AuxInt) + sym := auxToSym(v_0.Aux) b = b.Func.Entry v0 := b.NewValue0(v.Pos, OpArg, t) v.copyOf(v0) - v0.AuxInt = off - v0.Aux = sym + v0.AuxInt = int32ToAuxInt(off) + v0.Aux = symToAux(sym) return true } return false @@ -4104,10 +4092,10 @@ func rewriteValueARM64_OpARM64FMOVDload(v *Value) bool { // match: (FMOVDload [off] {sym} ptr (MOVDstore [off] {sym} ptr val _)) // result: (FMOVDgpfp val) for { - off := v.AuxInt - sym := v.Aux + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) ptr := v_0 - if v_1.Op != OpARM64MOVDstore || v_1.AuxInt != off || v_1.Aux != sym { + if v_1.Op != OpARM64MOVDstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { break } val := v_1.Args[1] @@ -4225,8 +4213,8 @@ func rewriteValueARM64_OpARM64FMOVDstore(v *Value) bool { // match: (FMOVDstore [off] {sym} ptr (FMOVDgpfp val) mem) // result: (MOVDstore [off] {sym} ptr val mem) for { - off := v.AuxInt - sym := v.Aux + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) ptr := v_0 if v_1.Op != OpARM64FMOVDgpfp { break @@ -4234,8 +4222,8 @@ func rewriteValueARM64_OpARM64FMOVDstore(v *Value) bool { val := v_1.Args[0] mem := v_2 v.reset(OpARM64MOVDstore) - v.AuxInt = off - v.Aux = sym + v.AuxInt = int32ToAuxInt(off) + v.Aux = symToAux(sym) v.AddArg3(ptr, val, mem) return true } @@ -4351,10 +4339,10 @@ func rewriteValueARM64_OpARM64FMOVSload(v *Value) bool { // match: (FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) // result: (FMOVSgpfp val) for { - off := v.AuxInt - sym := v.Aux + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) ptr := v_0 - if v_1.Op != OpARM64MOVWstore || v_1.AuxInt != off || v_1.Aux != sym { + if v_1.Op != OpARM64MOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { break } val := v_1.Args[1] @@ -4472,8 +4460,8 @@ func rewriteValueARM64_OpARM64FMOVSstore(v *Value) bool { // match: (FMOVSstore [off] {sym} ptr (FMOVSgpfp val) mem) // result: (MOVWstore [off] {sym} ptr val mem) for { - off := v.AuxInt - sym := v.Aux + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) ptr := v_0 if v_1.Op != OpARM64FMOVSgpfp { break @@ -4481,8 +4469,8 @@ func rewriteValueARM64_OpARM64FMOVSstore(v *Value) bool { val := v_1.Args[0] mem := v_2 v.reset(OpARM64MOVWstore) - v.AuxInt = off - v.Aux = sym + v.AuxInt = int32ToAuxInt(off) + v.Aux = symToAux(sym) v.AddArg3(ptr, val, mem) return true } @@ -9253,10 +9241,10 @@ func rewriteValueARM64_OpARM64MOVDload(v *Value) bool { // match: (MOVDload [off] {sym} ptr (FMOVDstore [off] {sym} ptr val _)) // result: (FMOVDfpgp val) for { - off := v.AuxInt - sym := v.Aux + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) ptr := v_0 - if v_1.Op != OpARM64FMOVDstore || v_1.AuxInt != off || v_1.Aux != sym { + if v_1.Op != OpARM64FMOVDstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { break } val := v_1.Args[1] @@ -9535,8 +9523,8 @@ func rewriteValueARM64_OpARM64MOVDstore(v *Value) bool { // match: (MOVDstore [off] {sym} ptr (FMOVDfpgp val) mem) // result: 
(FMOVDstore [off] {sym} ptr val mem) for { - off := v.AuxInt - sym := v.Aux + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) ptr := v_0 if v_1.Op != OpARM64FMOVDfpgp { break @@ -9544,8 +9532,8 @@ func rewriteValueARM64_OpARM64MOVDstore(v *Value) bool { val := v_1.Args[0] mem := v_2 v.reset(OpARM64FMOVDstore) - v.AuxInt = off - v.Aux = sym + v.AuxInt = int32ToAuxInt(off) + v.Aux = symToAux(sym) v.AddArg3(ptr, val, mem) return true } @@ -12040,10 +12028,10 @@ func rewriteValueARM64_OpARM64MOVWUload(v *Value) bool { // match: (MOVWUload [off] {sym} ptr (FMOVSstore [off] {sym} ptr val _)) // result: (FMOVSfpgp val) for { - off := v.AuxInt - sym := v.Aux + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) ptr := v_0 - if v_1.Op != OpARM64FMOVSstore || v_1.AuxInt != off || v_1.Aux != sym { + if v_1.Op != OpARM64FMOVSstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { break } val := v_1.Args[1] @@ -12921,8 +12909,8 @@ func rewriteValueARM64_OpARM64MOVWstore(v *Value) bool { // match: (MOVWstore [off] {sym} ptr (FMOVSfpgp val) mem) // result: (FMOVSstore [off] {sym} ptr val mem) for { - off := v.AuxInt - sym := v.Aux + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) ptr := v_0 if v_1.Op != OpARM64FMOVSfpgp { break @@ -12930,8 +12918,8 @@ func rewriteValueARM64_OpARM64MOVWstore(v *Value) bool { val := v_1.Args[0] mem := v_2 v.reset(OpARM64FMOVSstore) - v.AuxInt = off - v.Aux = sym + v.AuxInt = int32ToAuxInt(off) + v.Aux = symToAux(sym) v.AddArg3(ptr, val, mem) return true } @@ -21828,6 +21816,19 @@ func rewriteValueARM64_OpARM64XORshiftRL(v *Value) bool { } return false } +func rewriteValueARM64_OpAddr(v *Value) bool { + v_0 := v.Args[0] + // match: (Addr {sym} base) + // result: (MOVDaddr {sym} base) + for { + sym := auxToSym(v.Aux) + base := v_0 + v.reset(OpARM64MOVDaddr) + v.Aux = symToAux(sym) + v.AddArg(base) + return true + } +} func rewriteValueARM64_OpAtomicAnd8(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -21878,7 +21879,7 @@ func rewriteValueARM64_OpAvg64u(v *Value) bool { y := v_1 v.reset(OpARM64ADD) v0 := b.NewValue0(v.Pos, OpARM64SRLconst, t) - v0.AuxInt = 1 + v0.AuxInt = int64ToAuxInt(1) v1 := b.NewValue0(v.Pos, OpARM64SUB, t) v1.AddArg2(x, y) v0.AddArg(v1) @@ -21896,7 +21897,7 @@ func rewriteValueARM64_OpBitLen32(v *Value) bool { x := v_0 v.reset(OpARM64SUB) v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) - v0.AuxInt = 32 + v0.AuxInt = int64ToAuxInt(32) v1 := b.NewValue0(v.Pos, OpARM64CLZW, typ.Int) v1.AddArg(x) v.AddArg2(v0, v1) @@ -21913,7 +21914,7 @@ func rewriteValueARM64_OpBitLen64(v *Value) bool { x := v_0 v.reset(OpARM64SUB) v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) - v0.AuxInt = 64 + v0.AuxInt = int64ToAuxInt(64) v1 := b.NewValue0(v.Pos, OpARM64CLZ, typ.Int) v1.AddArg(x) v.AddArg2(v0, v1) @@ -21929,7 +21930,7 @@ func rewriteValueARM64_OpBitRev16(v *Value) bool { for { x := v_0 v.reset(OpARM64SRLconst) - v.AuxInt = 48 + v.AuxInt = int64ToAuxInt(48) v0 := b.NewValue0(v.Pos, OpARM64RBIT, typ.UInt64) v0.AddArg(x) v.AddArg(v0) @@ -21945,7 +21946,7 @@ func rewriteValueARM64_OpBitRev8(v *Value) bool { for { x := v_0 v.reset(OpARM64SRLconst) - v.AuxInt = 56 + v.AuxInt = int64ToAuxInt(56) v0 := b.NewValue0(v.Pos, OpARM64RBIT, typ.UInt64) v0.AddArg(x) v.AddArg(v0) @@ -21992,12 +21993,82 @@ func rewriteValueARM64_OpCondSelect(v *Value) bool { } return false } +func rewriteValueARM64_OpConst16(v *Value) bool { + // match: (Const16 [val]) + // result: (MOVDconst [int64(val)]) + for { + val := auxIntToInt16(v.AuxInt) + 
v.reset(OpARM64MOVDconst) + v.AuxInt = int64ToAuxInt(int64(val)) + return true + } +} +func rewriteValueARM64_OpConst32(v *Value) bool { + // match: (Const32 [val]) + // result: (MOVDconst [int64(val)]) + for { + val := auxIntToInt32(v.AuxInt) + v.reset(OpARM64MOVDconst) + v.AuxInt = int64ToAuxInt(int64(val)) + return true + } +} +func rewriteValueARM64_OpConst32F(v *Value) bool { + // match: (Const32F [val]) + // result: (FMOVSconst [float64(val)]) + for { + val := auxIntToFloat32(v.AuxInt) + v.reset(OpARM64FMOVSconst) + v.AuxInt = float64ToAuxInt(float64(val)) + return true + } +} +func rewriteValueARM64_OpConst64(v *Value) bool { + // match: (Const64 [val]) + // result: (MOVDconst [int64(val)]) + for { + val := auxIntToInt64(v.AuxInt) + v.reset(OpARM64MOVDconst) + v.AuxInt = int64ToAuxInt(int64(val)) + return true + } +} +func rewriteValueARM64_OpConst64F(v *Value) bool { + // match: (Const64F [val]) + // result: (FMOVDconst [float64(val)]) + for { + val := auxIntToFloat64(v.AuxInt) + v.reset(OpARM64FMOVDconst) + v.AuxInt = float64ToAuxInt(float64(val)) + return true + } +} +func rewriteValueARM64_OpConst8(v *Value) bool { + // match: (Const8 [val]) + // result: (MOVDconst [int64(val)]) + for { + val := auxIntToInt8(v.AuxInt) + v.reset(OpARM64MOVDconst) + v.AuxInt = int64ToAuxInt(int64(val)) + return true + } +} +func rewriteValueARM64_OpConstBool(v *Value) bool { + // match: (ConstBool [b]) + // result: (MOVDconst [b2i(b)]) + for { + b := auxIntToBool(v.AuxInt) + v.reset(OpARM64MOVDconst) + v.AuxInt = int64ToAuxInt(b2i(b)) + return true + } +} func rewriteValueARM64_OpConstNil(v *Value) bool { // match: (ConstNil) // result: (MOVDconst [0]) for { v.reset(OpARM64MOVDconst) - v.AuxInt = 0 + v.AuxInt = int64ToAuxInt(0) return true } } @@ -22014,7 +22085,7 @@ func rewriteValueARM64_OpCtz16(v *Value) bool { v.Type = t v0 := b.NewValue0(v.Pos, OpARM64RBITW, typ.UInt32) v1 := b.NewValue0(v.Pos, OpARM64ORconst, typ.UInt32) - v1.AuxInt = 0x10000 + v1.AuxInt = int64ToAuxInt(0x10000) v1.AddArg(x) v0.AddArg(v1) v.AddArg(v0) @@ -22064,7 +22135,7 @@ func rewriteValueARM64_OpCtz8(v *Value) bool { v.Type = t v0 := b.NewValue0(v.Pos, OpARM64RBITW, typ.UInt32) v1 := b.NewValue0(v.Pos, OpARM64ORconst, typ.UInt32) - v1.AuxInt = 0x100 + v1.AuxInt = int64ToAuxInt(0x100) v1.AddArg(x) v0.AddArg(v1) v.AddArg(v0) @@ -22076,9 +22147,12 @@ func rewriteValueARM64_OpDiv16(v *Value) bool { v_0 := v.Args[0] b := v.Block typ := &b.Func.Config.Types - // match: (Div16 x y) + // match: (Div16 [false] x y) // result: (DIVW (SignExt16to32 x) (SignExt16to32 y)) for { + if auxIntToBool(v.AuxInt) != false { + break + } x := v_0 y := v_1 v.reset(OpARM64DIVW) @@ -22089,6 +22163,7 @@ func rewriteValueARM64_OpDiv16(v *Value) bool { v.AddArg2(v0, v1) return true } + return false } func rewriteValueARM64_OpDiv16u(v *Value) bool { v_1 := v.Args[1] @@ -22109,6 +22184,40 @@ func rewriteValueARM64_OpDiv16u(v *Value) bool { return true } } +func rewriteValueARM64_OpDiv32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (Div32 [false] x y) + // result: (DIVW x y) + for { + if auxIntToBool(v.AuxInt) != false { + break + } + x := v_0 + y := v_1 + v.reset(OpARM64DIVW) + v.AddArg2(x, y) + return true + } + return false +} +func rewriteValueARM64_OpDiv64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (Div64 [false] x y) + // result: (DIV x y) + for { + if auxIntToBool(v.AuxInt) != false { + break + } + x := v_0 + y := v_1 + v.reset(OpARM64DIV) + v.AddArg2(x, y) + return true + } + return false +} func 
rewriteValueARM64_OpDiv8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -22265,7 +22374,7 @@ func rewriteValueARM64_OpEqB(v *Value) bool { y := v_1 v.reset(OpARM64XOR) v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) - v0.AuxInt = 1 + v0.AuxInt = int64ToAuxInt(1) v1 := b.NewValue0(v.Pos, OpARM64XOR, typ.Bool) v1.AddArg2(x, y) v.AddArg2(v0, v1) @@ -22314,7 +22423,7 @@ func rewriteValueARM64_OpHmul32(v *Value) bool { x := v_0 y := v_1 v.reset(OpARM64SRAconst) - v.AuxInt = 32 + v.AuxInt = int64ToAuxInt(32) v0 := b.NewValue0(v.Pos, OpARM64MULL, typ.Int64) v0.AddArg2(x, y) v.AddArg(v0) @@ -22332,7 +22441,7 @@ func rewriteValueARM64_OpHmul32u(v *Value) bool { x := v_0 y := v_1 v.reset(OpARM64SRAconst) - v.AuxInt = 32 + v.AuxInt = int64ToAuxInt(32) v0 := b.NewValue0(v.Pos, OpARM64UMULL, typ.UInt64) v0.AddArg2(x, y) v.AddArg(v0) @@ -22896,10 +23005,10 @@ func rewriteValueARM64_OpLocalAddr(v *Value) bool { // match: (LocalAddr {sym} base _) // result: (MOVDaddr {sym} base) for { - sym := v.Aux + sym := auxToSym(v.Aux) base := v_0 v.reset(OpARM64MOVDaddr) - v.Aux = sym + v.Aux = symToAux(sym) v.AddArg(base) return true } @@ -23346,6 +23455,32 @@ func rewriteValueARM64_OpMod16u(v *Value) bool { return true } } +func rewriteValueARM64_OpMod32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (Mod32 x y) + // result: (MODW x y) + for { + x := v_0 + y := v_1 + v.reset(OpARM64MODW) + v.AddArg2(x, y) + return true + } +} +func rewriteValueARM64_OpMod64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (Mod64 x y) + // result: (MOD x y) + for { + x := v_0 + y := v_1 + v.reset(OpARM64MOD) + v.AddArg2(x, y) + return true + } +} func rewriteValueARM64_OpMod8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -23835,7 +23970,7 @@ func rewriteValueARM64_OpNot(v *Value) bool { x := v_0 v.reset(OpARM64XOR) v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) - v0.AuxInt = 1 + v0.AuxInt = int64ToAuxInt(1) v.AddArg2(v0, x) return true } @@ -23843,25 +23978,26 @@ func rewriteValueARM64_OpNot(v *Value) bool { func rewriteValueARM64_OpOffPtr(v *Value) bool { v_0 := v.Args[0] // match: (OffPtr [off] ptr:(SP)) - // result: (MOVDaddr [off] ptr) + // cond: is32Bit(off) + // result: (MOVDaddr [int32(off)] ptr) for { - off := v.AuxInt + off := auxIntToInt64(v.AuxInt) ptr := v_0 - if ptr.Op != OpSP { + if ptr.Op != OpSP || !(is32Bit(off)) { break } v.reset(OpARM64MOVDaddr) - v.AuxInt = off + v.AuxInt = int32ToAuxInt(int32(off)) v.AddArg(ptr) return true } // match: (OffPtr [off] ptr) // result: (ADDconst [off] ptr) for { - off := v.AuxInt + off := auxIntToInt64(v.AuxInt) ptr := v_0 v.reset(OpARM64ADDconst) - v.AuxInt = off + v.AuxInt = int64ToAuxInt(off) v.AddArg(ptr) return true } @@ -24000,15 +24136,15 @@ func rewriteValueARM64_OpRotateLeft16(v *Value) bool { if v_1.Op != OpARM64MOVDconst { break } - c := v_1.AuxInt + c := auxIntToInt64(v_1.AuxInt) v.reset(OpOr16) v0 := b.NewValue0(v.Pos, OpLsh16x64, t) v1 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) - v1.AuxInt = c & 15 + v1.AuxInt = int64ToAuxInt(c & 15) v0.AddArg2(x, v1) v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t) v3 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) - v3.AuxInt = -c & 15 + v3.AuxInt = int64ToAuxInt(-c & 15) v2.AddArg2(x, v3) v.AddArg2(v0, v2) return true @@ -24060,15 +24196,15 @@ func rewriteValueARM64_OpRotateLeft8(v *Value) bool { if v_1.Op != OpARM64MOVDconst { break } - c := v_1.AuxInt + c := auxIntToInt64(v_1.AuxInt) v.reset(OpOr8) v0 := b.NewValue0(v.Pos, OpLsh8x64, t) v1 := 
b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) - v1.AuxInt = c & 7 + v1.AuxInt = int64ToAuxInt(c & 7) v0.AddArg2(x, v1) v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t) v3 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) - v3.AuxInt = -c & 7 + v3.AuxInt = int64ToAuxInt(-c & 7) v2.AddArg2(x, v3) v.AddArg2(v0, v2) return true @@ -24939,7 +25075,7 @@ func rewriteValueARM64_OpSelect0(v *Value) bool { v0 := b.NewValue0(v.Pos, OpARM64ADCSflags, types.NewTuple(typ.UInt64, types.TypeFlags)) v1 := b.NewValue0(v.Pos, OpSelect1, types.TypeFlags) v2 := b.NewValue0(v.Pos, OpARM64ADDSconstflags, types.NewTuple(typ.UInt64, types.TypeFlags)) - v2.AuxInt = -1 + v2.AuxInt = int64ToAuxInt(-1) v2.AddArg(c) v1.AddArg(v2) v0.AddArg3(x, y, v1) @@ -24987,7 +25123,7 @@ func rewriteValueARM64_OpSelect1(v *Value) bool { v1 := b.NewValue0(v.Pos, OpARM64ADCSflags, types.NewTuple(typ.UInt64, types.TypeFlags)) v2 := b.NewValue0(v.Pos, OpSelect1, types.TypeFlags) v3 := b.NewValue0(v.Pos, OpARM64ADDSconstflags, types.NewTuple(typ.UInt64, types.TypeFlags)) - v3.AuxInt = -1 + v3.AuxInt = int64ToAuxInt(-1) v3.AddArg(c) v2.AddArg(v3) v1.AddArg3(x, y, v2) @@ -25030,7 +25166,7 @@ func rewriteValueARM64_OpSlicemask(v *Value) bool { t := v.Type x := v_0 v.reset(OpARM64SRAconst) - v.AuxInt = 63 + v.AuxInt = int64ToAuxInt(63) v0 := b.NewValue0(v.Pos, OpARM64NEG, t) v0.AddArg(x) v.AddArg(v0) -- 2.50.0
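Note on the mechanical change above: switching a rule from "->" to "=>" in ARM64.rules tells the rule generator to emit typed access to Value.AuxInt and Value.Aux, so the generated rewriteARM64.go code goes through conversion helpers (auxIntToInt32/int32ToAuxInt, auxIntToInt64/int64ToAuxInt, auxToSym/symToAux, auxIntToBool, b2i, ...) instead of assigning raw int64/interface values, and rules such as (Div64 [false] x y) now match the aux field explicitly. The sketch below is a minimal, self-contained illustration of that helper pattern under simplified assumptions; the value struct and helper bodies are illustrative stand-ins, not the compiler's actual definitions.

```go
// Toy illustration of the typed-aux idea used by "=>" rules: AuxInt is
// still a single int64 slot on the value, but all reads and writes go
// through small typed conversion helpers rather than raw assignments.
package main

import (
	"fmt"
	"math"
)

// value is a stand-in for ssa.Value; the real type carries much more.
type value struct {
	Op     string
	AuxInt int64 // untyped storage slot shared by all ops
}

// Typed conversion helpers, mirroring the naming style seen in the
// generated code above (simplified, assumed bodies).
func int32ToAuxInt(i int32) int64 { return int64(i) }
func auxIntToInt32(i int64) int32 { return int32(i) }

func boolToAuxInt(b bool) int64 { return b2i(b) }
func auxIntToBool(i int64) bool { return i != 0 }

// Floats are packed into the int64 slot via their bit pattern.
func float64ToAuxInt(f float64) int64 { return int64(math.Float64bits(f)) }
func auxIntToFloat64(i int64) float64 { return math.Float64frombits(uint64(i)) }

// b2i is the helper referenced by the (ConstBool [b]) => (MOVDconst [b2i(b)]) rule.
func b2i(b bool) int64 {
	if b {
		return 1
	}
	return 0
}

func main() {
	// Old untyped style ("->" rules): raw int64 assignment.
	v := &value{Op: "MOVDaddr"}
	v.AuxInt = 8

	// New typed style ("=>" rules): the offset is conceptually an int32,
	// so it round-trips through the typed helpers.
	off := auxIntToInt32(v.AuxInt)
	v.AuxInt = int32ToAuxInt(off + 4)
	fmt.Println(v.Op, auxIntToInt32(v.AuxInt)) // MOVDaddr 12

	// Booleans and floats share the same int64 slot.
	c := &value{Op: "MOVDconst", AuxInt: boolToAuxInt(true)}
	fmt.Println(auxIntToBool(c.AuxInt)) // true

	f := &value{Op: "FMOVDconst", AuxInt: float64ToAuxInt(1.5)}
	fmt.Println(auxIntToFloat64(f.AuxInt)) // 1.5
}
```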