Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: fix ephemeral pointer problem on amd64
author: Keith Randall <khr@golang.org>
Sat, 28 Mar 2020 05:03:33 +0000 (22:03 -0700)
committer: Keith Randall <khr@golang.org>
Mon, 30 Mar 2020 17:25:29 +0000 (17:25 +0000)
Make sure we don't use the rewrite ptr + (c + x) -> c + (ptr + x), as
that may create an ephemeral out-of-bounds pointer.

I have not seen an actual bug caused by this yet, but we've seen
such bugs in the 386 port, so I'm fixing this issue for amd64 as well.

The load-combining rules needed to be reworked somewhat to still
work without the above broken rule.

Update #37881

Change-Id: I8046d170e89e2035195f261535e34ca7d8aca68a
Reviewed-on: https://go-review.googlesource.com/c/go/+/226437
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/rewrite.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
test/codegen/memcombine.go

index ca5962f2494a5086d6ec0ca75628180a6f62d868..b5133d6c14dd132806538cd34ef84b91a4ccd458 100644 (file)
 
 // Little-endian loads
 
-(ORL                  x0:(MOVBload [i0] {s} p0 mem)
-    sh:(SHLLconst [8] x1:(MOVBload [i1] {s} p1 mem)))
+(OR(L|Q)                  x0:(MOVBload [i0] {s} p mem)
+    sh:(SHL(L|Q)const [8] x1:(MOVBload [i1] {s} p mem)))
   && i1 == i0+1
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVWload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
 
-(ORQ                  x0:(MOVBload [i0] {s} p0 mem)
-    sh:(SHLQconst [8] x1:(MOVBload [i1] {s} p1 mem)))
-  && i1 == i0+1
+(OR(L|Q)                  x0:(MOVBload [i] {s} p0 mem)
+    sh:(SHL(L|Q)const [8] x1:(MOVBload [i] {s} p1 mem)))
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVWload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
 
-(ORL                   x0:(MOVWload [i0] {s} p0 mem)
-    sh:(SHLLconst [16] x1:(MOVWload [i1] {s} p1 mem)))
+(OR(L|Q)                   x0:(MOVWload [i0] {s} p mem)
+    sh:(SHL(L|Q)const [16] x1:(MOVWload [i1] {s} p mem)))
   && i1 == i0+2
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVLload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem)
 
-(ORQ                   x0:(MOVWload [i0] {s} p0 mem)
-    sh:(SHLQconst [16] x1:(MOVWload [i1] {s} p1 mem)))
-  && i1 == i0+2
+(OR(L|Q)                   x0:(MOVWload [i] {s} p0 mem)
+    sh:(SHL(L|Q)const [16] x1:(MOVWload [i] {s} p1 mem)))
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 2)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVLload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem)
 
-(ORQ                   x0:(MOVLload [i0] {s} p0 mem)
-    sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p1 mem)))
+(ORQ                   x0:(MOVLload [i0] {s} p mem)
+    sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p mem)))
   && i1 == i0+4
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (MOVQload [i0] {s} p0 mem)
+  -> @mergePoint(b,x0,x1) (MOVQload [i0] {s} p mem)
 
-(ORL
-    s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p0 mem))
-    or:(ORL
-        s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p1 mem))
+(ORQ                   x0:(MOVLload [i] {s} p0 mem)
+    sh:(SHLQconst [32] x1:(MOVLload [i] {s} p1 mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && sh.Uses == 1
+  && sequentialAddresses(p0, p1, 4)
+  && mergePoint(b,x0,x1) != nil
+  && clobber(x0, x1, sh)
+  -> @mergePoint(b,x0,x1) (MOVQload [i] {s} p0 mem)
+
+(OR(L|Q)
+    s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem))
+    or:(OR(L|Q)
+        s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem))
        y))
   && i1 == i0+1
   && j1 == j0+8
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j0] (MOVWload [i0] {s} p0 mem)) y)
+  -> @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i0] {s} p mem)) y)
 
-(ORQ
-    s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p0 mem))
-    or:(ORQ
-        s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p1 mem))
+(OR(L|Q)
+    s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem))
+    or:(OR(L|Q)
+        s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem))
        y))
-  && i1 == i0+1
   && j1 == j0+8
   && j0 % 16 == 0
   && x0.Uses == 1
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVWload [i0] {s} p0 mem)) y)
+  -> @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y)
 
 (ORQ
-    s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p0 mem))
+    s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p mem))
     or:(ORQ
-        s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p1 mem))
+        s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p mem))
        y))
   && i1 == i0+2
   && j1 == j0+16
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p0 mem)) y)
-
-// Little-endian indexed loads
-
-// Move constants offsets from LEAQx up into load. This lets the above combining
-// rules discover indexed load-combining instances.
-//TODO:remove! These rules are bad.
-(MOV(B|W|L|Q)load [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)load [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-(MOV(B|W|L|Q)load [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)load [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-(MOV(B|W|L|Q)load [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)load [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-(MOV(B|W|L|Q)load [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)load [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-
-(MOV(B|W|L|Q)store [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)store [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
-(MOV(B|W|L|Q)store [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)store [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
-(MOV(B|W|L|Q)store [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)store [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
-(MOV(B|W|L|Q)store [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem) && i1 != 0 && is32Bit(i0+i1)
--> (MOV(B|W|L|Q)store [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
+  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p mem)) y)
+
+(ORQ
+    s1:(SHLQconst [j1] x1:(MOVWload [i] {s} p1 mem))
+    or:(ORQ
+        s0:(SHLQconst [j0] x0:(MOVWload [i] {s} p0 mem))
+       y))
+  && j1 == j0+16
+  && j0 % 32 == 0
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && or.Uses == 1
+  && sequentialAddresses(p0, p1, 2)
+  && mergePoint(b,x0,x1,y) != nil
+  && clobber(x0, x1, s0, s1, or)
+  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i] {s} p0 mem)) y)
 
 // Big-endian loads
 
-(ORL
-                       x1:(MOVBload [i1] {s} p0 mem)
-    sh:(SHLLconst [8]  x0:(MOVBload [i0] {s} p1 mem)))
+(OR(L|Q)
+                           x1:(MOVBload [i1] {s} p mem)
+    sh:(SHL(L|Q)const [8]  x0:(MOVBload [i0] {s} p mem)))
   && i1 == i0+1
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem))
 
-(ORQ
-                       x1:(MOVBload [i1] {s} p0 mem)
-    sh:(SHLQconst [8]  x0:(MOVBload [i0] {s} p1 mem)))
-  && i1 == i0+1
+(OR(L|Q)
+                           x1:(MOVBload [i] {s} p1 mem)
+    sh:(SHL(L|Q)const [8]  x0:(MOVBload [i] {s} p0 mem)))
   && x0.Uses == 1
   && x1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, sh)
-  -> @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem))
 
-(ORL
-                        r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p0 mem))
-    sh:(SHLLconst [16]  r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p1 mem))))
+(OR(L|Q)
+                            r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem))
+    sh:(SHL(L|Q)const [16]  r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))))
   && i1 == i0+2
   && x0.Uses == 1
   && x1.Uses == 1
   && r0.Uses == 1
   && r1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, r0, r1, sh)
-  -> @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem))
 
-(ORQ
-                        r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p0 mem))
-    sh:(SHLQconst [16]  r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p1 mem))))
-  && i1 == i0+2
+(OR(L|Q)
+                            r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem))
+    sh:(SHL(L|Q)const [16]  r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))))
   && x0.Uses == 1
   && x1.Uses == 1
   && r0.Uses == 1
   && r1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 2)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, r0, r1, sh)
-  -> @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem))
 
 (ORQ
-                        r1:(BSWAPL x1:(MOVLload [i1] {s} p0 mem))
-    sh:(SHLQconst [32]  r0:(BSWAPL x0:(MOVLload [i0] {s} p1 mem))))
+                        r1:(BSWAPL x1:(MOVLload [i1] {s} p mem))
+    sh:(SHLQconst [32]  r0:(BSWAPL x0:(MOVLload [i0] {s} p mem))))
   && i1 == i0+4
   && x0.Uses == 1
   && x1.Uses == 1
   && r0.Uses == 1
   && r1.Uses == 1
   && sh.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1) != nil
   && clobber(x0, x1, r0, r1, sh)
-  -> @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p0 mem))
+  -> @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p mem))
+
+(ORQ
+                        r1:(BSWAPL x1:(MOVLload [i] {s} p1 mem))
+    sh:(SHLQconst [32]  r0:(BSWAPL x0:(MOVLload [i] {s} p0 mem))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && r0.Uses == 1
+  && r1.Uses == 1
+  && sh.Uses == 1
+  && sequentialAddresses(p0, p1, 4)
+  && mergePoint(b,x0,x1) != nil
+  && clobber(x0, x1, r0, r1, sh)
+  -> @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i] {s} p0 mem))
 
-(ORL
-    s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p0 mem))
-    or:(ORL
-        s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p1 mem))
+(OR(L|Q)
+    s0:(SHL(L|Q)const [j0] x0:(MOVBload [i0] {s} p mem))
+    or:(OR(L|Q)
+        s1:(SHL(L|Q)const [j1] x1:(MOVBload [i1] {s} p mem))
        y))
   && i1 == i0+1
   && j1 == j0-8
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p0 mem))) y)
+  -> @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y)
 
-(ORQ
-    s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p0 mem))
-    or:(ORQ
-        s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p1 mem))
+(OR(L|Q)
+    s0:(SHL(L|Q)const [j0] x0:(MOVBload [i] {s} p0 mem))
+    or:(OR(L|Q)
+        s1:(SHL(L|Q)const [j1] x1:(MOVBload [i] {s} p1 mem))
        y))
-  && i1 == i0+1
   && j1 == j0-8
   && j1 % 16 == 0
   && x0.Uses == 1
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p0 mem))) y)
+  -> @mergePoint(b,x0,x1,y) (OR(L|Q) <v.Type> (SHL(L|Q)const <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y)
 
 (ORQ
-    s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p0 mem)))
+    s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem)))
     or:(ORQ
-        s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p1 mem)))
+        s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)))
        y))
   && i1 == i0+2
   && j1 == j0-16
   && s0.Uses == 1
   && s1.Uses == 1
   && or.Uses == 1
-  && same(p0, p1, 1)
   && mergePoint(b,x0,x1,y) != nil
   && clobber(x0, x1, r0, r1, s0, s1, or)
-  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p0 mem))) y)
+  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p mem))) y)
+
+(ORQ
+    s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem)))
+    or:(ORQ
+        s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)))
+       y))
+  && j1 == j0-16
+  && j1 % 32 == 0
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && r0.Uses == 1
+  && r1.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && or.Uses == 1
+  && sequentialAddresses(p0, p1, 2)
+  && mergePoint(b,x0,x1,y) != nil
+  && clobber(x0, x1, r0, r1, s0, s1, or)
+  -> @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i] {s} p0 mem))) y)
 
 // Combine 2 byte stores + shift into rolw 8 + word store
+(MOVBstore [i] {s} p w
+  x0:(MOVBstore [i-1] {s} p (SHRWconst [8] w) mem))
+  && x0.Uses == 1
+  && clobber(x0)
+  -> (MOVWstore [i-1] {s} p (ROLWconst <w.Type> [8] w) mem)
 (MOVBstore [i] {s} p1 w
-  x0:(MOVBstore [i-1] {s} p0 (SHRWconst [8] w) mem))
+  x0:(MOVBstore [i] {s} p0 (SHRWconst [8] w) mem))
   && x0.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && clobber(x0)
-  -> (MOVWstore [i-1] {s} p0 (ROLWconst <w.Type> [8] w) mem)
+  -> (MOVWstore [i] {s} p0 (ROLWconst <w.Type> [8] w) mem)
 
 // Combine stores + shifts into bswap and larger (unaligned) stores
+(MOVBstore [i] {s} p w
+  x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w)
+  x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w)
+  x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && clobber(x0, x1, x2)
+  -> (MOVLstore [i-3] {s} p (BSWAPL <w.Type> w) mem)
 (MOVBstore [i] {s} p3 w
-  x2:(MOVBstore [i-1] {s} p2 (SHRLconst [8] w)
-  x1:(MOVBstore [i-2] {s} p1 (SHRLconst [16] w)
-  x0:(MOVBstore [i-3] {s} p0 (SHRLconst [24] w) mem))))
+  x2:(MOVBstore [i] {s} p2 (SHRLconst [8] w)
+  x1:(MOVBstore [i] {s} p1 (SHRLconst [16] w)
+  x0:(MOVBstore [i] {s} p0 (SHRLconst [24] w) mem))))
   && x0.Uses == 1
   && x1.Uses == 1
   && x2.Uses == 1
-  && same(p0, p1, 1)
-  && same(p1, p2, 1)
-  && same(p2, p3, 1)
+  && sequentialAddresses(p0, p1, 1)
+  && sequentialAddresses(p1, p2, 1)
+  && sequentialAddresses(p2, p3, 1)
   && clobber(x0, x1, x2)
-  -> (MOVLstore [i-3] {s} p0 (BSWAPL <w.Type> w) mem)
-
+  -> (MOVLstore [i] {s} p0 (BSWAPL <w.Type> w) mem)
+
+(MOVBstore [i] {s} p w
+  x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w)
+  x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w)
+  x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w)
+  x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w)
+  x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w)
+  x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w)
+  x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && x4.Uses == 1
+  && x5.Uses == 1
+  && x6.Uses == 1
+  && clobber(x0, x1, x2, x3, x4, x5, x6)
+  -> (MOVQstore [i-7] {s} p (BSWAPQ <w.Type> w) mem)
 (MOVBstore [i] {s} p7 w
-  x6:(MOVBstore [i-1] {s} p6 (SHRQconst [8] w)
-  x5:(MOVBstore [i-2] {s} p5 (SHRQconst [16] w)
-  x4:(MOVBstore [i-3] {s} p4 (SHRQconst [24] w)
-  x3:(MOVBstore [i-4] {s} p3 (SHRQconst [32] w)
-  x2:(MOVBstore [i-5] {s} p2 (SHRQconst [40] w)
-  x1:(MOVBstore [i-6] {s} p1 (SHRQconst [48] w)
-  x0:(MOVBstore [i-7] {s} p0 (SHRQconst [56] w) mem))))))))
+  x6:(MOVBstore [i] {s} p6 (SHRQconst [8] w)
+  x5:(MOVBstore [i] {s} p5 (SHRQconst [16] w)
+  x4:(MOVBstore [i] {s} p4 (SHRQconst [24] w)
+  x3:(MOVBstore [i] {s} p3 (SHRQconst [32] w)
+  x2:(MOVBstore [i] {s} p2 (SHRQconst [40] w)
+  x1:(MOVBstore [i] {s} p1 (SHRQconst [48] w)
+  x0:(MOVBstore [i] {s} p0 (SHRQconst [56] w) mem))))))))
   && x0.Uses == 1
   && x1.Uses == 1
   && x2.Uses == 1
   && x3.Uses == 1
   && x4.Uses == 1
   && x5.Uses == 1
   && x6.Uses == 1
-  && same(p0, p1, 1)
-  && same(p1, p2, 1)
-  && same(p2, p3, 1)
-  && same(p3, p4, 1)
-  && same(p4, p5, 1)
-  && same(p5, p6, 1)
-  && same(p6, p7, 1)
+  && sequentialAddresses(p0, p1, 1)
+  && sequentialAddresses(p1, p2, 1)
+  && sequentialAddresses(p2, p3, 1)
+  && sequentialAddresses(p3, p4, 1)
+  && sequentialAddresses(p4, p5, 1)
+  && sequentialAddresses(p5, p6, 1)
+  && sequentialAddresses(p6, p7, 1)
   && clobber(x0, x1, x2, x3, x4, x5, x6)
-  -> (MOVQstore [i-7] {s} p0 (BSWAPQ <w.Type> w) mem)
+  -> (MOVQstore [i] {s} p0 (BSWAPQ <w.Type> w) mem)
 
 // Combine constant stores into larger (unaligned) stores.
-(MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem))
+(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 1 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p0 mem)
-(MOVBstoreconst [a] {s} p1 x:(MOVBstoreconst [c] {s} p0 mem))
+  -> (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p mem)
+(MOVBstoreconst [a] {s} p x:(MOVBstoreconst [c] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 1 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p0 mem)
-(MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem))
+  -> (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p mem)
+(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 2 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p0 mem)
-(MOVWstoreconst [a] {s} p1 x:(MOVWstoreconst [c] {s} p0 mem))
+  -> (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p mem)
+(MOVWstoreconst [a] {s} p x:(MOVWstoreconst [c] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 2 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p0 mem)
-(MOVLstoreconst [c] {s} p1 x:(MOVLstoreconst [a] {s} p0 mem))
+  -> (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p mem)
+(MOVLstoreconst [c] {s} p x:(MOVLstoreconst [a] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 4 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVQstore [ValAndOff(a).Off()] {s} p0 (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
-(MOVLstoreconst [a] {s} p1 x:(MOVLstoreconst [c] {s} p0 mem))
+  -> (MOVQstore [ValAndOff(a).Off()] {s} p (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
+(MOVLstoreconst [a] {s} p x:(MOVLstoreconst [c] {s} p mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(a).Off() + 4 == ValAndOff(c).Off()
   && clobber(x)
-  -> (MOVQstore [ValAndOff(a).Off()] {s} p0 (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
-(MOVQstoreconst [c] {s} p1 x:(MOVQstoreconst [c2] {s} p0 mem))
+  -> (MOVQstore [ValAndOff(a).Off()] {s} p (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
+(MOVQstoreconst [c] {s} p x:(MOVQstoreconst [c2] {s} p mem))
   && config.useSSE
   && x.Uses == 1
-  && same(p0, p1, 1)
   && ValAndOff(c2).Off() + 8 == ValAndOff(c).Off()
   && ValAndOff(c).Val() == 0
   && ValAndOff(c2).Val() == 0
   && clobber(x)
-  -> (MOVOstore [ValAndOff(c2).Off()] {s} p0 (MOVOconst [0]) mem)
+  -> (MOVOstore [ValAndOff(c2).Off()] {s} p (MOVOconst [0]) mem)
 
-// Combine stores into larger (unaligned) stores.
-(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) x:(MOVBstore [i-1] {s} p0 w mem))
+// Combine stores into larger (unaligned) stores. Little endian.
+(MOVBstore [i] {s} p (SHR(W|L|Q)const [8] w) x:(MOVBstore [i-1] {s} p w mem))
+  && x.Uses == 1
+  && clobber(x)
+  -> (MOVWstore [i-1] {s} p w mem)
+(MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHR(W|L|Q)const [8] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && clobber(x)
-  -> (MOVWstore [i-1] {s} p0 w mem)
-(MOVBstore [i] {s} p1 w x:(MOVBstore [i+1] {s} p0 (SHR(W|L|Q)const [8] w) mem))
+  -> (MOVWstore [i] {s} p w mem)
+(MOVBstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVBstore [i-1] {s} p w0:(SHR(L|Q)const [j-8] w) mem))
+  && x.Uses == 1
+  && clobber(x)
+  -> (MOVWstore [i-1] {s} p w0 mem)
+(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) x:(MOVBstore [i] {s} p0 w mem))
+  && x.Uses == 1
+  && sequentialAddresses(p0, p1, 1)
+  && clobber(x)
+  -> (MOVWstore [i] {s} p0 w mem)
+(MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHR(W|L|Q)const [8] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && clobber(x)
   -> (MOVWstore [i] {s} p0 w mem)
-(MOVBstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVBstore [i-1] {s} p0 w0:(SHR(L|Q)const [j-8] w) mem))
+(MOVBstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVBstore [i] {s} p0 w0:(SHR(L|Q)const [j-8] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 1)
   && clobber(x)
-  -> (MOVWstore [i-1] {s} p0 w0 mem)
-(MOVWstore [i] {s} p1 (SHR(L|Q)const [16] w) x:(MOVWstore [i-2] {s} p0 w mem))
+  -> (MOVWstore [i] {s} p0 w0 mem)
+
+(MOVWstore [i] {s} p (SHR(L|Q)const [16] w) x:(MOVWstore [i-2] {s} p w mem))
+  && x.Uses == 1
+  && clobber(x)
+  -> (MOVLstore [i-2] {s} p w mem)
+(MOVWstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVWstore [i-2] {s} p w0:(SHR(L|Q)const [j-16] w) mem))
+  && x.Uses == 1
+  && clobber(x)
+  -> (MOVLstore [i-2] {s} p w0 mem)
+(MOVWstore [i] {s} p1 (SHR(L|Q)const [16] w) x:(MOVWstore [i] {s} p0 w mem))
+  && x.Uses == 1
+  && sequentialAddresses(p0, p1, 2)
+  && clobber(x)
+  -> (MOVLstore [i] {s} p0 w mem)
+(MOVWstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVWstore [i] {s} p0 w0:(SHR(L|Q)const [j-16] w) mem))
+  && x.Uses == 1
+  && sequentialAddresses(p0, p1, 2)
+  && clobber(x)
+  -> (MOVLstore [i] {s} p0 w0 mem)
+
+(MOVLstore [i] {s} p (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p w mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && clobber(x)
-  -> (MOVLstore [i-2] {s} p0 w mem)
-(MOVWstore [i] {s} p1 (SHR(L|Q)const [j] w) x:(MOVWstore [i-2] {s} p0 w0:(SHR(L|Q)const [j-16] w) mem))
+  -> (MOVQstore [i-4] {s} p w mem)
+(MOVLstore [i] {s} p (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p w0:(SHRQconst [j-32] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
   && clobber(x)
-  -> (MOVLstore [i-2] {s} p0 w0 mem)
-(MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p0 w mem))
+  -> (MOVQstore [i-4] {s} p w0 mem)
+(MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i] {s} p0 w mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 4)
   && clobber(x)
-  -> (MOVQstore [i-4] {s} p0 w mem)
-(MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p0 w0:(SHRQconst [j-32] w) mem))
+  -> (MOVQstore [i] {s} p0 w mem)
+(MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i] {s} p0 w0:(SHRQconst [j-32] w) mem))
   && x.Uses == 1
-  && same(p0, p1, 1)
+  && sequentialAddresses(p0, p1, 4)
   && clobber(x)
-  -> (MOVQstore [i-4] {s} p0 w0 mem)
+  -> (MOVQstore [i] {s} p0 w0 mem)
 
 (MOVBstore [i] {s} p
   x1:(MOVBload [j] {s2} p2 mem)
index fc03f0d72c8ab00f4c71d7a2e5835eeef2bdec3d..878b15eeee9db1131ffbc994c3165f83b6fba294 100644 (file)
@@ -1260,46 +1260,15 @@ func sequentialAddresses(x, y *Value, n int64) bool {
                        x.Args[0] == y.Args[1] && x.Args[1] == y.Args[0]) {
                return true
        }
-       return false
-}
-
-// same reports whether x and y are the same value.
-// It checks to a maximum depth of d, so it may report
-// a false negative.
-// TODO: remove when amd64 port is switched to using sequentialAddresses
-func same(x, y *Value, depth int) bool {
-       if x == y {
+       if x.Op == OpAMD64ADDQ && y.Op == OpAMD64LEAQ1 && y.AuxInt == n && y.Aux == nil &&
+               (x.Args[0] == y.Args[0] && x.Args[1] == y.Args[1] ||
+                       x.Args[0] == y.Args[1] && x.Args[1] == y.Args[0]) {
                return true
        }
-       if depth <= 0 {
-               return false
-       }
-       if x.Op != y.Op || x.Aux != y.Aux || x.AuxInt != y.AuxInt {
-               return false
-       }
-       if len(x.Args) != len(y.Args) {
-               return false
-       }
-       if opcodeTable[x.Op].commutative {
-               // Check exchanged ordering first.
-               for i, a := range x.Args {
-                       j := i
-                       if j < 2 {
-                               j ^= 1
-                       }
-                       b := y.Args[j]
-                       if !same(a, b, depth-1) {
-                               goto checkNormalOrder
-                       }
-               }
+       if x.Op == OpAMD64LEAQ1 && y.Op == OpAMD64LEAQ1 && y.AuxInt == x.AuxInt+n && x.Aux == y.Aux &&
+               (x.Args[0] == y.Args[0] && x.Args[1] == y.Args[1] ||
+                       x.Args[0] == y.Args[1] && x.Args[1] == y.Args[0]) {
                return true
-       checkNormalOrder:
        }
-       for i, a := range x.Args {
-               b := y.Args[i]
-               if !same(a, b, depth-1) {
-                       return false
-               }
-       }
-       return true
+       return false
 }
index b9a401cca93183f64c6ef283ae731283f58febc8..e4d86485d4fc6d9ef14c49882dfc131551555124 100644 (file)
@@ -10140,7 +10140,6 @@ func rewriteValueAMD64_OpAMD64MOVBatomicload(v *Value) bool {
 func rewriteValueAMD64_OpAMD64MOVBload(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       b := v.Block
        // match: (MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _))
        // cond: sym == sym2 && off == off2 && isSamePtr(ptr, ptr2)
        // result: (MOVBQZX x)
@@ -10205,118 +10204,6 @@ func rewriteValueAMD64_OpAMD64MOVBload(v *Value) bool {
                v.AddArg2(base, mem)
                return true
        }
-       // match: (MOVBload [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVBload [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ1 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVBload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVBload [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVBload [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ2 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVBload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVBload [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVBload [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ4 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVBload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVBload [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVBload [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ8 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVBload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
        // match: (MOVBload [off1] {sym1} (LEAL [off2] {sym2} base) mem)
        // cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
        // result: (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -10722,159 +10609,124 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                v.AddArg3(base, val, mem)
                return true
        }
-       // match: (MOVBstore [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVBstore [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
+       // match: (MOVBstore [i] {s} p w x0:(MOVBstore [i-1] {s} p (SHRWconst [8] w) mem))
+       // cond: x0.Uses == 1 && clobber(x0)
+       // result: (MOVWstore [i-1] {s} p (ROLWconst <w.Type> [8] w) mem)
        for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ1 {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               w := v_1
+               x0 := v_2
+               if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i-1 || x0.Aux != s {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               mem := x0.Args[2]
+               if p != x0.Args[0] {
                        break
                }
-               v.reset(OpAMD64MOVBstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
+               x0_1 := x0.Args[1]
+               if x0_1.Op != OpAMD64SHRWconst || x0_1.AuxInt != 8 || w != x0_1.Args[0] || !(x0.Uses == 1 && clobber(x0)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i - 1
+               v.Aux = s
+               v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, w.Type)
+               v0.AuxInt = 8
+               v0.AddArg(w)
+               v.AddArg3(p, v0, mem)
                return true
        }
-       // match: (MOVBstore [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVBstore [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
+       // match: (MOVBstore [i] {s} p1 w x0:(MOVBstore [i] {s} p0 (SHRWconst [8] w) mem))
+       // cond: x0.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x0)
+       // result: (MOVWstore [i] {s} p0 (ROLWconst <w.Type> [8] w) mem)
        for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ2 {
+               i := v.AuxInt
+               s := v.Aux
+               p1 := v_0
+               w := v_1
+               x0 := v_2
+               if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i || x0.Aux != s {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               mem := x0.Args[2]
+               p0 := x0.Args[0]
+               x0_1 := x0.Args[1]
+               if x0_1.Op != OpAMD64SHRWconst || x0_1.AuxInt != 8 || w != x0_1.Args[0] || !(x0.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x0)) {
                        break
                }
-               v.reset(OpAMD64MOVBstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i
+               v.Aux = s
+               v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, w.Type)
+               v0.AuxInt = 8
+               v0.AddArg(w)
+               v.AddArg3(p0, v0, mem)
                return true
        }
-       // match: (MOVBstore [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVBstore [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
+       // match: (MOVBstore [i] {s} p w x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w) x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w) x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && clobber(x0, x1, x2)
+       // result: (MOVLstore [i-3] {s} p (BSWAPL <w.Type> w) mem)
        for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ4 {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               w := v_1
+               x2 := v_2
+               if x2.Op != OpAMD64MOVBstore || x2.AuxInt != i-1 || x2.Aux != s {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               _ = x2.Args[2]
+               if p != x2.Args[0] {
                        break
                }
-               v.reset(OpAMD64MOVBstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVBstore [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVBstore [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ8 {
+               x2_1 := x2.Args[1]
+               if x2_1.Op != OpAMD64SHRLconst || x2_1.AuxInt != 8 || w != x2_1.Args[0] {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               x1 := x2.Args[2]
+               if x1.Op != OpAMD64MOVBstore || x1.AuxInt != i-2 || x1.Aux != s {
                        break
                }
-               v.reset(OpAMD64MOVBstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVBstore [i] {s} p1 w x0:(MOVBstore [i-1] {s} p0 (SHRWconst [8] w) mem))
-       // cond: x0.Uses == 1 && same(p0, p1, 1) && clobber(x0)
-       // result: (MOVWstore [i-1] {s} p0 (ROLWconst <w.Type> [8] w) mem)
-       for {
-               i := v.AuxInt
-               s := v.Aux
-               p1 := v_0
-               w := v_1
-               x0 := v_2
-               if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i-1 || x0.Aux != s {
+               _ = x1.Args[2]
+               if p != x1.Args[0] {
+                       break
+               }
+               x1_1 := x1.Args[1]
+               if x1_1.Op != OpAMD64SHRLconst || x1_1.AuxInt != 16 || w != x1_1.Args[0] {
+                       break
+               }
+               x0 := x1.Args[2]
+               if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i-3 || x0.Aux != s {
                        break
                }
                mem := x0.Args[2]
-               p0 := x0.Args[0]
+               if p != x0.Args[0] {
+                       break
+               }
                x0_1 := x0.Args[1]
-               if x0_1.Op != OpAMD64SHRWconst || x0_1.AuxInt != 8 || w != x0_1.Args[0] || !(x0.Uses == 1 && same(p0, p1, 1) && clobber(x0)) {
+               if x0_1.Op != OpAMD64SHRLconst || x0_1.AuxInt != 24 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && clobber(x0, x1, x2)) {
                        break
                }
-               v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i - 1
+               v.reset(OpAMD64MOVLstore)
+               v.AuxInt = i - 3
                v.Aux = s
-               v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, w.Type)
-               v0.AuxInt = 8
+               v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, w.Type)
                v0.AddArg(w)
-               v.AddArg3(p0, v0, mem)
+               v.AddArg3(p, v0, mem)
                return true
        }
-       // match: (MOVBstore [i] {s} p3 w x2:(MOVBstore [i-1] {s} p2 (SHRLconst [8] w) x1:(MOVBstore [i-2] {s} p1 (SHRLconst [16] w) x0:(MOVBstore [i-3] {s} p0 (SHRLconst [24] w) mem))))
-       // cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && same(p0, p1, 1) && same(p1, p2, 1) && same(p2, p3, 1) && clobber(x0, x1, x2)
-       // result: (MOVLstore [i-3] {s} p0 (BSWAPL <w.Type> w) mem)
+       // match: (MOVBstore [i] {s} p3 w x2:(MOVBstore [i] {s} p2 (SHRLconst [8] w) x1:(MOVBstore [i] {s} p1 (SHRLconst [16] w) x0:(MOVBstore [i] {s} p0 (SHRLconst [24] w) mem))))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && sequentialAddresses(p0, p1, 1) && sequentialAddresses(p1, p2, 1) && sequentialAddresses(p2, p3, 1) && clobber(x0, x1, x2)
+       // result: (MOVLstore [i] {s} p0 (BSWAPL <w.Type> w) mem)
        for {
                i := v.AuxInt
                s := v.Aux
                p3 := v_0
                w := v_1
                x2 := v_2
-               if x2.Op != OpAMD64MOVBstore || x2.AuxInt != i-1 || x2.Aux != s {
+               if x2.Op != OpAMD64MOVBstore || x2.AuxInt != i || x2.Aux != s {
                        break
                }
                _ = x2.Args[2]
@@ -10884,7 +10736,7 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                        break
                }
                x1 := x2.Args[2]
-               if x1.Op != OpAMD64MOVBstore || x1.AuxInt != i-2 || x1.Aux != s {
+               if x1.Op != OpAMD64MOVBstore || x1.AuxInt != i || x1.Aux != s {
                        break
                }
                _ = x1.Args[2]
@@ -10894,37 +10746,39 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                        break
                }
                x0 := x1.Args[2]
-               if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i-3 || x0.Aux != s {
+               if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i || x0.Aux != s {
                        break
                }
                mem := x0.Args[2]
                p0 := x0.Args[0]
                x0_1 := x0.Args[1]
-               if x0_1.Op != OpAMD64SHRLconst || x0_1.AuxInt != 24 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && same(p0, p1, 1) && same(p1, p2, 1) && same(p2, p3, 1) && clobber(x0, x1, x2)) {
+               if x0_1.Op != OpAMD64SHRLconst || x0_1.AuxInt != 24 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && sequentialAddresses(p0, p1, 1) && sequentialAddresses(p1, p2, 1) && sequentialAddresses(p2, p3, 1) && clobber(x0, x1, x2)) {
                        break
                }
                v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i - 3
+               v.AuxInt = i
                v.Aux = s
                v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, w.Type)
                v0.AddArg(w)
                v.AddArg3(p0, v0, mem)
                return true
        }
-       // match: (MOVBstore [i] {s} p7 w x6:(MOVBstore [i-1] {s} p6 (SHRQconst [8] w) x5:(MOVBstore [i-2] {s} p5 (SHRQconst [16] w) x4:(MOVBstore [i-3] {s} p4 (SHRQconst [24] w) x3:(MOVBstore [i-4] {s} p3 (SHRQconst [32] w) x2:(MOVBstore [i-5] {s} p2 (SHRQconst [40] w) x1:(MOVBstore [i-6] {s} p1 (SHRQconst [48] w) x0:(MOVBstore [i-7] {s} p0 (SHRQconst [56] w) mem))))))))
-       // cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && same(p0, p1, 1) && same(p1, p2, 1) && same(p2, p3, 1) && same(p3, p4, 1) && same(p4, p5, 1) && same(p5, p6, 1) && same(p6, p7, 1) && clobber(x0, x1, x2, x3, x4, x5, x6)
-       // result: (MOVQstore [i-7] {s} p0 (BSWAPQ <w.Type> w) mem)
+       // match: (MOVBstore [i] {s} p w x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w) x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w) x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w) x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w) x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w) x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w) x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && clobber(x0, x1, x2, x3, x4, x5, x6)
+       // result: (MOVQstore [i-7] {s} p (BSWAPQ <w.Type> w) mem)
        for {
                i := v.AuxInt
                s := v.Aux
-               p7 := v_0
+               p := v_0
                w := v_1
                x6 := v_2
                if x6.Op != OpAMD64MOVBstore || x6.AuxInt != i-1 || x6.Aux != s {
                        break
                }
                _ = x6.Args[2]
-               p6 := x6.Args[0]
+               if p != x6.Args[0] {
+                       break
+               }
                x6_1 := x6.Args[1]
                if x6_1.Op != OpAMD64SHRQconst || x6_1.AuxInt != 8 || w != x6_1.Args[0] {
                        break
@@ -10934,7 +10788,9 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                        break
                }
                _ = x5.Args[2]
-               p5 := x5.Args[0]
+               if p != x5.Args[0] {
+                       break
+               }
                x5_1 := x5.Args[1]
                if x5_1.Op != OpAMD64SHRQconst || x5_1.AuxInt != 16 || w != x5_1.Args[0] {
                        break
@@ -10944,7 +10800,9 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                        break
                }
                _ = x4.Args[2]
-               p4 := x4.Args[0]
+               if p != x4.Args[0] {
+                       break
+               }
                x4_1 := x4.Args[1]
                if x4_1.Op != OpAMD64SHRQconst || x4_1.AuxInt != 24 || w != x4_1.Args[0] {
                        break
@@ -10954,7 +10812,9 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                        break
                }
                _ = x3.Args[2]
-               p3 := x3.Args[0]
+               if p != x3.Args[0] {
+                       break
+               }
                x3_1 := x3.Args[1]
                if x3_1.Op != OpAMD64SHRQconst || x3_1.AuxInt != 32 || w != x3_1.Args[0] {
                        break
@@ -10964,7 +10824,9 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                        break
                }
                _ = x2.Args[2]
-               p2 := x2.Args[0]
+               if p != x2.Args[0] {
+                       break
+               }
                x2_1 := x2.Args[1]
                if x2_1.Op != OpAMD64SHRQconst || x2_1.AuxInt != 40 || w != x2_1.Args[0] {
                        break
@@ -10974,7 +10836,9 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                        break
                }
                _ = x1.Args[2]
-               p1 := x1.Args[0]
+               if p != x1.Args[0] {
+                       break
+               }
                x1_1 := x1.Args[1]
                if x1_1.Op != OpAMD64SHRQconst || x1_1.AuxInt != 48 || w != x1_1.Args[0] {
                        break
@@ -10984,9 +10848,11 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                        break
                }
                mem := x0.Args[2]
-               p0 := x0.Args[0]
+               if p != x0.Args[0] {
+                       break
+               }
                x0_1 := x0.Args[1]
-               if x0_1.Op != OpAMD64SHRQconst || x0_1.AuxInt != 56 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && same(p0, p1, 1) && same(p1, p2, 1) && same(p2, p3, 1) && same(p3, p4, 1) && same(p4, p5, 1) && same(p5, p6, 1) && same(p6, p7, 1) && clobber(x0, x1, x2, x3, x4, x5, x6)) {
+               if x0_1.Op != OpAMD64SHRQconst || x0_1.AuxInt != 56 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && clobber(x0, x1, x2, x3, x4, x5, x6)) {
                        break
                }
                v.reset(OpAMD64MOVQstore)
@@ -10994,103 +10860,326 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                v.Aux = s
                v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPQ, w.Type)
                v0.AddArg(w)
-               v.AddArg3(p0, v0, mem)
+               v.AddArg3(p, v0, mem)
                return true
        }
-       // match: (MOVBstore [i] {s} p1 (SHRWconst [8] w) x:(MOVBstore [i-1] {s} p0 w mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVWstore [i-1] {s} p0 w mem)
+       // match: (MOVBstore [i] {s} p7 w x6:(MOVBstore [i] {s} p6 (SHRQconst [8] w) x5:(MOVBstore [i] {s} p5 (SHRQconst [16] w) x4:(MOVBstore [i] {s} p4 (SHRQconst [24] w) x3:(MOVBstore [i] {s} p3 (SHRQconst [32] w) x2:(MOVBstore [i] {s} p2 (SHRQconst [40] w) x1:(MOVBstore [i] {s} p1 (SHRQconst [48] w) x0:(MOVBstore [i] {s} p0 (SHRQconst [56] w) mem))))))))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && sequentialAddresses(p0, p1, 1) && sequentialAddresses(p1, p2, 1) && sequentialAddresses(p2, p3, 1) && sequentialAddresses(p3, p4, 1) && sequentialAddresses(p4, p5, 1) && sequentialAddresses(p5, p6, 1) && sequentialAddresses(p6, p7, 1) && clobber(x0, x1, x2, x3, x4, x5, x6)
+       // result: (MOVQstore [i] {s} p0 (BSWAPQ <w.Type> w) mem)
        for {
                i := v.AuxInt
                s := v.Aux
-               p1 := v_0
-               if v_1.Op != OpAMD64SHRWconst || v_1.AuxInt != 8 {
+               p7 := v_0
+               w := v_1
+               x6 := v_2
+               if x6.Op != OpAMD64MOVBstore || x6.AuxInt != i || x6.Aux != s {
                        break
                }
-               w := v_1.Args[0]
-               x := v_2
-               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+               _ = x6.Args[2]
+               p6 := x6.Args[0]
+               x6_1 := x6.Args[1]
+               if x6_1.Op != OpAMD64SHRQconst || x6_1.AuxInt != 8 || w != x6_1.Args[0] {
                        break
                }
-               mem := x.Args[2]
-               p0 := x.Args[0]
-               if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               x5 := x6.Args[2]
+               if x5.Op != OpAMD64MOVBstore || x5.AuxInt != i || x5.Aux != s {
                        break
                }
-               v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i - 1
-               v.Aux = s
-               v.AddArg3(p0, w, mem)
-               return true
-       }
-       // match: (MOVBstore [i] {s} p1 (SHRLconst [8] w) x:(MOVBstore [i-1] {s} p0 w mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVWstore [i-1] {s} p0 w mem)
-       for {
-               i := v.AuxInt
-               s := v.Aux
-               p1 := v_0
-               if v_1.Op != OpAMD64SHRLconst || v_1.AuxInt != 8 {
+               _ = x5.Args[2]
+               p5 := x5.Args[0]
+               x5_1 := x5.Args[1]
+               if x5_1.Op != OpAMD64SHRQconst || x5_1.AuxInt != 16 || w != x5_1.Args[0] {
                        break
                }
-               w := v_1.Args[0]
-               x := v_2
-               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+               x4 := x5.Args[2]
+               if x4.Op != OpAMD64MOVBstore || x4.AuxInt != i || x4.Aux != s {
                        break
                }
-               mem := x.Args[2]
-               p0 := x.Args[0]
-               if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               _ = x4.Args[2]
+               p4 := x4.Args[0]
+               x4_1 := x4.Args[1]
+               if x4_1.Op != OpAMD64SHRQconst || x4_1.AuxInt != 24 || w != x4_1.Args[0] {
                        break
                }
-               v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i - 1
-               v.Aux = s
-               v.AddArg3(p0, w, mem)
-               return true
-       }
-       // match: (MOVBstore [i] {s} p1 (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p0 w mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVWstore [i-1] {s} p0 w mem)
-       for {
-               i := v.AuxInt
-               s := v.Aux
-               p1 := v_0
-               if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 8 {
+               x3 := x4.Args[2]
+               if x3.Op != OpAMD64MOVBstore || x3.AuxInt != i || x3.Aux != s {
                        break
                }
-               w := v_1.Args[0]
-               x := v_2
-               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+               _ = x3.Args[2]
+               p3 := x3.Args[0]
+               x3_1 := x3.Args[1]
+               if x3_1.Op != OpAMD64SHRQconst || x3_1.AuxInt != 32 || w != x3_1.Args[0] {
                        break
                }
-               mem := x.Args[2]
-               p0 := x.Args[0]
-               if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               x2 := x3.Args[2]
+               if x2.Op != OpAMD64MOVBstore || x2.AuxInt != i || x2.Aux != s {
                        break
                }
-               v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i - 1
-               v.Aux = s
-               v.AddArg3(p0, w, mem)
-               return true
-       }
-       // match: (MOVBstore [i] {s} p1 w x:(MOVBstore [i+1] {s} p0 (SHRWconst [8] w) mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVWstore [i] {s} p0 w mem)
-       for {
-               i := v.AuxInt
-               s := v.Aux
-               p1 := v_0
-               w := v_1
-               x := v_2
-               if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
-                       break
+               _ = x2.Args[2]
+               p2 := x2.Args[0]
+               x2_1 := x2.Args[1]
+               if x2_1.Op != OpAMD64SHRQconst || x2_1.AuxInt != 40 || w != x2_1.Args[0] {
+                       break
+               }
+               x1 := x2.Args[2]
+               if x1.Op != OpAMD64MOVBstore || x1.AuxInt != i || x1.Aux != s {
+                       break
+               }
+               _ = x1.Args[2]
+               p1 := x1.Args[0]
+               x1_1 := x1.Args[1]
+               if x1_1.Op != OpAMD64SHRQconst || x1_1.AuxInt != 48 || w != x1_1.Args[0] {
+                       break
+               }
+               x0 := x1.Args[2]
+               if x0.Op != OpAMD64MOVBstore || x0.AuxInt != i || x0.Aux != s {
+                       break
+               }
+               mem := x0.Args[2]
+               p0 := x0.Args[0]
+               x0_1 := x0.Args[1]
+               if x0_1.Op != OpAMD64SHRQconst || x0_1.AuxInt != 56 || w != x0_1.Args[0] || !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && sequentialAddresses(p0, p1, 1) && sequentialAddresses(p1, p2, 1) && sequentialAddresses(p2, p3, 1) && sequentialAddresses(p3, p4, 1) && sequentialAddresses(p4, p5, 1) && sequentialAddresses(p5, p6, 1) && sequentialAddresses(p6, p7, 1) && clobber(x0, x1, x2, x3, x4, x5, x6)) {
+                       break
+               }
+               v.reset(OpAMD64MOVQstore)
+               v.AuxInt = i
+               v.Aux = s
+               v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPQ, w.Type)
+               v0.AddArg(w)
+               v.AddArg3(p0, v0, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p (SHRWconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVWstore [i-1] {s} p w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRWconst || v_1.AuxInt != 8 {
+                       break
+               }
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+                       break
                }
                mem := x.Args[2]
-               p0 := x.Args[0]
+               if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i - 1
+               v.Aux = s
+               v.AddArg3(p, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p (SHRLconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVWstore [i-1] {s} p w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRLconst || v_1.AuxInt != 8 {
+                       break
+               }
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i - 1
+               v.Aux = s
+               v.AddArg3(p, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVWstore [i-1] {s} p w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 8 {
+                       break
+               }
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i - 1
+               v.Aux = s
+               v.AddArg3(p, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHRWconst [8] w) mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVWstore [i] {s} p w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               w := v_1
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] {
+                       break
+               }
+               x_1 := x.Args[1]
+               if x_1.Op != OpAMD64SHRWconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i
+               v.Aux = s
+               v.AddArg3(p, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHRLconst [8] w) mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVWstore [i] {s} p w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               w := v_1
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] {
+                       break
+               }
+               x_1 := x.Args[1]
+               if x_1.Op != OpAMD64SHRLconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i
+               v.Aux = s
+               v.AddArg3(p, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p w x:(MOVBstore [i+1] {s} p (SHRQconst [8] w) mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVWstore [i] {s} p w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               w := v_1
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] {
+                       break
+               }
                x_1 := x.Args[1]
-               if x_1.Op != OpAMD64SHRWconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if x_1.Op != OpAMD64SHRQconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i
+               v.Aux = s
+               v.AddArg3(p, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p (SHRLconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SHRLconst [j-8] w) mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVWstore [i-1] {s} p w0 mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRLconst {
+                       break
+               }
+               j := v_1.AuxInt
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] {
+                       break
+               }
+               w0 := x.Args[1]
+               if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i - 1
+               v.Aux = s
+               v.AddArg3(p, w0, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p (SHRQconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SHRQconst [j-8] w) mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVWstore [i-1] {s} p w0 mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRQconst {
+                       break
+               }
+               j := v_1.AuxInt
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] {
+                       break
+               }
+               w0 := x.Args[1]
+               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i - 1
+               v.Aux = s
+               v.AddArg3(p, w0, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p1 (SHRWconst [8] w) x:(MOVBstore [i] {s} p0 w mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+       // result: (MOVWstore [i] {s} p0 w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p1 := v_0
+               if v_1.Op != OpAMD64SHRWconst || v_1.AuxInt != 8 {
+                       break
+               }
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               p0 := x.Args[0]
+               if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVWstore)
@@ -11099,22 +11188,24 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                v.AddArg3(p0, w, mem)
                return true
        }
-       // match: (MOVBstore [i] {s} p1 w x:(MOVBstore [i+1] {s} p0 (SHRLconst [8] w) mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
+       // match: (MOVBstore [i] {s} p1 (SHRLconst [8] w) x:(MOVBstore [i] {s} p0 w mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
        // result: (MOVWstore [i] {s} p0 w mem)
        for {
                i := v.AuxInt
                s := v.Aux
                p1 := v_0
-               w := v_1
+               if v_1.Op != OpAMD64SHRLconst || v_1.AuxInt != 8 {
+                       break
+               }
+               w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
-               x_1 := x.Args[1]
-               if x_1.Op != OpAMD64SHRLconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVWstore)
@@ -11123,22 +11214,48 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                v.AddArg3(p0, w, mem)
                return true
        }
-       // match: (MOVBstore [i] {s} p1 w x:(MOVBstore [i+1] {s} p0 (SHRQconst [8] w) mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
+       // match: (MOVBstore [i] {s} p1 (SHRQconst [8] w) x:(MOVBstore [i] {s} p0 w mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
        // result: (MOVWstore [i] {s} p0 w mem)
        for {
                i := v.AuxInt
                s := v.Aux
                p1 := v_0
-               w := v_1
+               if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 8 {
+                       break
+               }
+               w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVBstore || x.AuxInt != i+1 || x.Aux != s {
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
+               if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i
+               v.Aux = s
+               v.AddArg3(p0, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHRWconst [8] w) mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+       // result: (MOVWstore [i] {s} p0 w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p0 := v_0
+               w := v_1
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               p1 := x.Args[0]
                x_1 := x.Args[1]
-               if x_1.Op != OpAMD64SHRQconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if x_1.Op != OpAMD64SHRWconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVWstore)
@@ -11147,9 +11264,57 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                v.AddArg3(p0, w, mem)
                return true
        }
-       // match: (MOVBstore [i] {s} p1 (SHRLconst [j] w) x:(MOVBstore [i-1] {s} p0 w0:(SHRLconst [j-8] w) mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVWstore [i-1] {s} p0 w0 mem)
+       // match: (MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHRLconst [8] w) mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+       // result: (MOVWstore [i] {s} p0 w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p0 := v_0
+               w := v_1
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               p1 := x.Args[0]
+               x_1 := x.Args[1]
+               if x_1.Op != OpAMD64SHRLconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i
+               v.Aux = s
+               v.AddArg3(p0, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p0 w x:(MOVBstore [i] {s} p1 (SHRQconst [8] w) mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+       // result: (MOVWstore [i] {s} p0 w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p0 := v_0
+               w := v_1
+               x := v_2
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               p1 := x.Args[0]
+               x_1 := x.Args[1]
+               if x_1.Op != OpAMD64SHRQconst || x_1.AuxInt != 8 || w != x_1.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i
+               v.Aux = s
+               v.AddArg3(p0, w, mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p1 (SHRLconst [j] w) x:(MOVBstore [i] {s} p0 w0:(SHRLconst [j-8] w) mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+       // result: (MOVWstore [i] {s} p0 w0 mem)
        for {
                i := v.AuxInt
                s := v.Aux
@@ -11160,24 +11325,24 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                j := v_1.AuxInt
                w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
                w0 := x.Args[1]
-               if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i - 1
+               v.AuxInt = i
                v.Aux = s
                v.AddArg3(p0, w0, mem)
                return true
        }
-       // match: (MOVBstore [i] {s} p1 (SHRQconst [j] w) x:(MOVBstore [i-1] {s} p0 w0:(SHRQconst [j-8] w) mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVWstore [i-1] {s} p0 w0 mem)
+       // match: (MOVBstore [i] {s} p1 (SHRQconst [j] w) x:(MOVBstore [i] {s} p0 w0:(SHRQconst [j-8] w) mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)
+       // result: (MOVWstore [i] {s} p0 w0 mem)
        for {
                i := v.AuxInt
                s := v.Aux
@@ -11188,17 +11353,17 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
                j := v_1.AuxInt
                w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVBstore || x.AuxInt != i-1 || x.Aux != s {
+               if x.Op != OpAMD64MOVBstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
                w0 := x.Args[1]
-               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-8 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 1) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i - 1
+               v.AuxInt = i
                v.Aux = s
                v.AddArg3(p0, w0, mem)
                return true
@@ -11337,13 +11502,13 @@ func rewriteValueAMD64_OpAMD64MOVBstoreconst(v *Value) bool {
                v.AddArg2(ptr, mem)
                return true
        }
-       // match: (MOVBstoreconst [c] {s} p1 x:(MOVBstoreconst [a] {s} p0 mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 1 == ValAndOff(c).Off() && clobber(x)
-       // result: (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p0 mem)
+       // match: (MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
+       // cond: x.Uses == 1 && ValAndOff(a).Off() + 1 == ValAndOff(c).Off() && clobber(x)
+       // result: (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p mem)
        for {
                c := v.AuxInt
                s := v.Aux
-               p1 := v_0
+               p := v_0
                x := v_1
                if x.Op != OpAMD64MOVBstoreconst {
                        break
@@ -11353,23 +11518,22 @@ func rewriteValueAMD64_OpAMD64MOVBstoreconst(v *Value) bool {
                        break
                }
                mem := x.Args[1]
-               p0 := x.Args[0]
-               if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+1 == ValAndOff(c).Off() && clobber(x)) {
+               if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+1 == ValAndOff(c).Off() && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVWstoreconst)
                v.AuxInt = makeValAndOff(ValAndOff(a).Val()&0xff|ValAndOff(c).Val()<<8, ValAndOff(a).Off())
                v.Aux = s
-               v.AddArg2(p0, mem)
+               v.AddArg2(p, mem)
                return true
        }
-       // match: (MOVBstoreconst [a] {s} p1 x:(MOVBstoreconst [c] {s} p0 mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 1 == ValAndOff(c).Off() && clobber(x)
-       // result: (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p0 mem)
+       // match: (MOVBstoreconst [a] {s} p x:(MOVBstoreconst [c] {s} p mem))
+       // cond: x.Uses == 1 && ValAndOff(a).Off() + 1 == ValAndOff(c).Off() && clobber(x)
+       // result: (MOVWstoreconst [makeValAndOff(ValAndOff(a).Val()&0xff | ValAndOff(c).Val()<<8, ValAndOff(a).Off())] {s} p mem)
        for {
                a := v.AuxInt
                s := v.Aux
-               p1 := v_0
+               p := v_0
                x := v_1
                if x.Op != OpAMD64MOVBstoreconst {
                        break
@@ -11379,14 +11543,13 @@ func rewriteValueAMD64_OpAMD64MOVBstoreconst(v *Value) bool {
                        break
                }
                mem := x.Args[1]
-               p0 := x.Args[0]
-               if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+1 == ValAndOff(c).Off() && clobber(x)) {
+               if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+1 == ValAndOff(c).Off() && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVWstoreconst)
                v.AuxInt = makeValAndOff(ValAndOff(a).Val()&0xff|ValAndOff(c).Val()<<8, ValAndOff(a).Off())
                v.Aux = s
-               v.AddArg2(p0, mem)
+               v.AddArg2(p, mem)
                return true
        }
        // match: (MOVBstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem)
@@ -11860,118 +12023,6 @@ func rewriteValueAMD64_OpAMD64MOVLload(v *Value) bool {
                v.AddArg2(base, mem)
                return true
        }
-       // match: (MOVLload [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVLload [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ1 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVLload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVLload [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVLload [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ2 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVLload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVLload [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVLload [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ4 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVLload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVLload [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVLload [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ8 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVLload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
        // match: (MOVLload [off1] {sym1} (LEAL [off2] {sym2} base) mem)
        // cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
        // result: (MOVLload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -12174,125 +12225,64 @@ func rewriteValueAMD64_OpAMD64MOVLstore(v *Value) bool {
                v.AddArg3(base, val, mem)
                return true
        }
-       // match: (MOVLstore [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVLstore [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ1 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVLstore [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVLstore [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
+       // match: (MOVLstore [i] {s} p (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p w mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVQstore [i-4] {s} p w mem)
        for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ2 {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 32 {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVLstore || x.AuxInt != i-4 || x.Aux != s {
                        break
                }
-               v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
+               mem := x.Args[2]
+               if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVQstore)
+               v.AuxInt = i - 4
+               v.Aux = s
+               v.AddArg3(p, w, mem)
                return true
        }
-       // match: (MOVLstore [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVLstore [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
+       // match: (MOVLstore [i] {s} p (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p w0:(SHRQconst [j-32] w) mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVQstore [i-4] {s} p w0 mem)
        for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ4 {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRQconst {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               j := v_1.AuxInt
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVLstore || x.AuxInt != i-4 || x.Aux != s {
                        break
                }
-               v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVLstore [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVLstore [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ8 {
+               mem := x.Args[2]
+               if p != x.Args[0] {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               w0 := x.Args[1]
+               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-32 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
                        break
                }
-               v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
+               v.reset(OpAMD64MOVQstore)
+               v.AuxInt = i - 4
+               v.Aux = s
+               v.AddArg3(p, w0, mem)
                return true
        }
-       // match: (MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i-4] {s} p0 w mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVQstore [i-4] {s} p0 w mem)
+       // match: (MOVLstore [i] {s} p1 (SHRQconst [32] w) x:(MOVLstore [i] {s} p0 w mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 4) && clobber(x)
+       // result: (MOVQstore [i] {s} p0 w mem)
        for {
                i := v.AuxInt
                s := v.Aux
@@ -12302,23 +12292,23 @@ func rewriteValueAMD64_OpAMD64MOVLstore(v *Value) bool {
                }
                w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVLstore || x.AuxInt != i-4 || x.Aux != s {
+               if x.Op != OpAMD64MOVLstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
-               if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 4) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVQstore)
-               v.AuxInt = i - 4
+               v.AuxInt = i
                v.Aux = s
                v.AddArg3(p0, w, mem)
                return true
        }
-       // match: (MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i-4] {s} p0 w0:(SHRQconst [j-32] w) mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVQstore [i-4] {s} p0 w0 mem)
+       // match: (MOVLstore [i] {s} p1 (SHRQconst [j] w) x:(MOVLstore [i] {s} p0 w0:(SHRQconst [j-32] w) mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 4) && clobber(x)
+       // result: (MOVQstore [i] {s} p0 w0 mem)
        for {
                i := v.AuxInt
                s := v.Aux
@@ -12329,17 +12319,17 @@ func rewriteValueAMD64_OpAMD64MOVLstore(v *Value) bool {
                j := v_1.AuxInt
                w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVLstore || x.AuxInt != i-4 || x.Aux != s {
+               if x.Op != OpAMD64MOVLstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
                w0 := x.Args[1]
-               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-32 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-32 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 4) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVQstore)
-               v.AuxInt = i - 4
+               v.AuxInt = i
                v.Aux = s
                v.AddArg3(p0, w0, mem)
                return true
@@ -13018,13 +13008,13 @@ func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
                v.AddArg2(ptr, mem)
                return true
        }
-       // match: (MOVLstoreconst [c] {s} p1 x:(MOVLstoreconst [a] {s} p0 mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 4 == ValAndOff(c).Off() && clobber(x)
-       // result: (MOVQstore [ValAndOff(a).Off()] {s} p0 (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
+       // match: (MOVLstoreconst [c] {s} p x:(MOVLstoreconst [a] {s} p mem))
+       // cond: x.Uses == 1 && ValAndOff(a).Off() + 4 == ValAndOff(c).Off() && clobber(x)
+       // result: (MOVQstore [ValAndOff(a).Off()] {s} p (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
        for {
                c := v.AuxInt
                s := v.Aux
-               p1 := v_0
+               p := v_0
                x := v_1
                if x.Op != OpAMD64MOVLstoreconst {
                        break
@@ -13034,8 +13024,7 @@ func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
                        break
                }
                mem := x.Args[1]
-               p0 := x.Args[0]
-               if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+4 == ValAndOff(c).Off() && clobber(x)) {
+               if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+4 == ValAndOff(c).Off() && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVQstore)
@@ -13043,16 +13032,16 @@ func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
                v.Aux = s
                v0 := b.NewValue0(x.Pos, OpAMD64MOVQconst, typ.UInt64)
                v0.AuxInt = ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32
-               v.AddArg3(p0, v0, mem)
+               v.AddArg3(p, v0, mem)
                return true
        }
-       // match: (MOVLstoreconst [a] {s} p1 x:(MOVLstoreconst [c] {s} p0 mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 4 == ValAndOff(c).Off() && clobber(x)
-       // result: (MOVQstore [ValAndOff(a).Off()] {s} p0 (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
+       // match: (MOVLstoreconst [a] {s} p x:(MOVLstoreconst [c] {s} p mem))
+       // cond: x.Uses == 1 && ValAndOff(a).Off() + 4 == ValAndOff(c).Off() && clobber(x)
+       // result: (MOVQstore [ValAndOff(a).Off()] {s} p (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
        for {
                a := v.AuxInt
                s := v.Aux
-               p1 := v_0
+               p := v_0
                x := v_1
                if x.Op != OpAMD64MOVLstoreconst {
                        break
@@ -13062,8 +13051,7 @@ func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
                        break
                }
                mem := x.Args[1]
-               p0 := x.Args[0]
-               if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+4 == ValAndOff(c).Off() && clobber(x)) {
+               if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+4 == ValAndOff(c).Off() && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVQstore)
@@ -13071,7 +13059,7 @@ func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
                v.Aux = s
                v0 := b.NewValue0(x.Pos, OpAMD64MOVQconst, typ.UInt64)
                v0.AuxInt = ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32
-               v.AddArg3(p0, v0, mem)
+               v.AddArg3(p, v0, mem)
                return true
        }
        // match: (MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem)
@@ -13423,118 +13411,6 @@ func rewriteValueAMD64_OpAMD64MOVQload(v *Value) bool {
                v.AddArg2(base, mem)
                return true
        }
-       // match: (MOVQload [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVQload [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ1 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVQload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVQload [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVQload [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ2 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVQload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVQload [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVQload [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ4 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVQload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVQload [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVQload [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ8 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVQload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
        // match: (MOVQload [off1] {sym1} (LEAL [off2] {sym2} base) mem)
        // cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
        // result: (MOVQload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -13614,7 +13490,6 @@ func rewriteValueAMD64_OpAMD64MOVQstore(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       b := v.Block
        // match: (MOVQstore [off1] {sym} (ADDQconst [off2] ptr) val mem)
        // cond: is32Bit(off1+off2)
        // result: (MOVQstore [off1+off2] {sym} ptr val mem)
@@ -13681,122 +13556,6 @@ func rewriteValueAMD64_OpAMD64MOVQstore(v *Value) bool {
                v.AddArg3(base, val, mem)
                return true
        }
-       // match: (MOVQstore [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVQstore [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ1 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVQstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVQstore [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVQstore [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ2 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVQstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVQstore [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVQstore [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ4 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVQstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVQstore [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVQstore [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ8 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVQstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
        // match: (MOVQstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
        // cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
        // result: (MOVQstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
@@ -14430,13 +14189,13 @@ func rewriteValueAMD64_OpAMD64MOVQstoreconst(v *Value) bool {
                v.AddArg2(ptr, mem)
                return true
        }
-       // match: (MOVQstoreconst [c] {s} p1 x:(MOVQstoreconst [c2] {s} p0 mem))
-       // cond: config.useSSE && x.Uses == 1 && same(p0, p1, 1) && ValAndOff(c2).Off() + 8 == ValAndOff(c).Off() && ValAndOff(c).Val() == 0 && ValAndOff(c2).Val() == 0 && clobber(x)
-       // result: (MOVOstore [ValAndOff(c2).Off()] {s} p0 (MOVOconst [0]) mem)
+       // match: (MOVQstoreconst [c] {s} p x:(MOVQstoreconst [c2] {s} p mem))
+       // cond: config.useSSE && x.Uses == 1 && ValAndOff(c2).Off() + 8 == ValAndOff(c).Off() && ValAndOff(c).Val() == 0 && ValAndOff(c2).Val() == 0 && clobber(x)
+       // result: (MOVOstore [ValAndOff(c2).Off()] {s} p (MOVOconst [0]) mem)
        for {
                c := v.AuxInt
                s := v.Aux
-               p1 := v_0
+               p := v_0
                x := v_1
                if x.Op != OpAMD64MOVQstoreconst {
                        break
@@ -14446,8 +14205,7 @@ func rewriteValueAMD64_OpAMD64MOVQstoreconst(v *Value) bool {
                        break
                }
                mem := x.Args[1]
-               p0 := x.Args[0]
-               if !(config.useSSE && x.Uses == 1 && same(p0, p1, 1) && ValAndOff(c2).Off()+8 == ValAndOff(c).Off() && ValAndOff(c).Val() == 0 && ValAndOff(c2).Val() == 0 && clobber(x)) {
+               if p != x.Args[0] || !(config.useSSE && x.Uses == 1 && ValAndOff(c2).Off()+8 == ValAndOff(c).Off() && ValAndOff(c).Val() == 0 && ValAndOff(c2).Val() == 0 && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVOstore)
@@ -14455,7 +14213,7 @@ func rewriteValueAMD64_OpAMD64MOVQstoreconst(v *Value) bool {
                v.Aux = s
                v0 := b.NewValue0(x.Pos, OpAMD64MOVOconst, types.TypeInt128)
                v0.AuxInt = 0
-               v.AddArg3(p0, v0, mem)
+               v.AddArg3(p, v0, mem)
                return true
        }
        // match: (MOVQstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem)
@@ -15119,118 +14877,6 @@ func rewriteValueAMD64_OpAMD64MOVWload(v *Value) bool {
                v.AddArg2(base, mem)
                return true
        }
-       // match: (MOVWload [i0] {s0} l:(LEAQ1 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVWload [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ1 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVWload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVWload [i0] {s0} l:(LEAQ2 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVWload [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ2 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVWload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVWload [i0] {s0} l:(LEAQ4 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVWload [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ4 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVWload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
-       // match: (MOVWload [i0] {s0} l:(LEAQ8 [i1] {s1} x y) mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVWload [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ8 {
-                       break
-               }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               mem := v_1
-               if !(i1 != 0 && is32Bit(i0+i1)) {
-                       break
-               }
-               v.reset(OpAMD64MOVWload)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg2(v0, mem)
-               return true
-       }
        // match: (MOVWload [off1] {sym1} (LEAL [off2] {sym2} base) mem)
        // cond: canMergeSym(sym1, sym2) && is32Bit(off1+off2)
        // result: (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -15416,125 +15062,119 @@ func rewriteValueAMD64_OpAMD64MOVWstore(v *Value) bool {
                v.AddArg3(base, val, mem)
                return true
        }
-       // match: (MOVWstore [i0] {s0} l:(LEAQ1 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVWstore [i0+i1] {s0} (LEAQ1 <l.Type> [0] {s1} x y) val mem)
+       // match: (MOVWstore [i] {s} p (SHRLconst [16] w) x:(MOVWstore [i-2] {s} p w mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVLstore [i-2] {s} p w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRLconst || v_1.AuxInt != 16 {
+                       break
+               }
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVWstore || x.AuxInt != i-2 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVLstore)
+               v.AuxInt = i - 2
+               v.Aux = s
+               v.AddArg3(p, w, mem)
+               return true
+       }
+       // match: (MOVWstore [i] {s} p (SHRQconst [16] w) x:(MOVWstore [i-2] {s} p w mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVLstore [i-2] {s} p w mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRQconst || v_1.AuxInt != 16 {
+                       break
+               }
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVWstore || x.AuxInt != i-2 || x.Aux != s {
+                       break
+               }
+               mem := x.Args[2]
+               if p != x.Args[0] || w != x.Args[1] || !(x.Uses == 1 && clobber(x)) {
+                       break
+               }
+               v.reset(OpAMD64MOVLstore)
+               v.AuxInt = i - 2
+               v.Aux = s
+               v.AddArg3(p, w, mem)
+               return true
+       }
+       // match: (MOVWstore [i] {s} p (SHRLconst [j] w) x:(MOVWstore [i-2] {s} p w0:(SHRLconst [j-16] w) mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVLstore [i-2] {s} p w0 mem)
        for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ1 {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRLconst {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               j := v_1.AuxInt
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVWstore || x.AuxInt != i-2 || x.Aux != s {
                        break
                }
-               v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ1, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVWstore [i0] {s0} l:(LEAQ2 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVWstore [i0+i1] {s0} (LEAQ2 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ2 {
+               mem := x.Args[2]
+               if p != x.Args[0] {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               w0 := x.Args[1]
+               if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
                        break
                }
-               v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ2, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
+               v.reset(OpAMD64MOVLstore)
+               v.AuxInt = i - 2
+               v.Aux = s
+               v.AddArg3(p, w0, mem)
                return true
        }
-       // match: (MOVWstore [i0] {s0} l:(LEAQ4 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVWstore [i0+i1] {s0} (LEAQ4 <l.Type> [0] {s1} x y) val mem)
+       // match: (MOVWstore [i] {s} p (SHRQconst [j] w) x:(MOVWstore [i-2] {s} p w0:(SHRQconst [j-16] w) mem))
+       // cond: x.Uses == 1 && clobber(x)
+       // result: (MOVLstore [i-2] {s} p w0 mem)
        for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ4 {
+               i := v.AuxInt
+               s := v.Aux
+               p := v_0
+               if v_1.Op != OpAMD64SHRQconst {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               j := v_1.AuxInt
+               w := v_1.Args[0]
+               x := v_2
+               if x.Op != OpAMD64MOVWstore || x.AuxInt != i-2 || x.Aux != s {
                        break
                }
-               v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ4, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
-               return true
-       }
-       // match: (MOVWstore [i0] {s0} l:(LEAQ8 [i1] {s1} x y) val mem)
-       // cond: i1 != 0 && is32Bit(i0+i1)
-       // result: (MOVWstore [i0+i1] {s0} (LEAQ8 <l.Type> [0] {s1} x y) val mem)
-       for {
-               i0 := v.AuxInt
-               s0 := v.Aux
-               l := v_0
-               if l.Op != OpAMD64LEAQ8 {
+               mem := x.Args[2]
+               if p != x.Args[0] {
                        break
                }
-               i1 := l.AuxInt
-               s1 := l.Aux
-               y := l.Args[1]
-               x := l.Args[0]
-               val := v_1
-               mem := v_2
-               if !(i1 != 0 && is32Bit(i0+i1)) {
+               w0 := x.Args[1]
+               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && clobber(x)) {
                        break
                }
-               v.reset(OpAMD64MOVWstore)
-               v.AuxInt = i0 + i1
-               v.Aux = s0
-               v0 := b.NewValue0(v.Pos, OpAMD64LEAQ8, l.Type)
-               v0.AuxInt = 0
-               v0.Aux = s1
-               v0.AddArg2(x, y)
-               v.AddArg3(v0, val, mem)
+               v.reset(OpAMD64MOVLstore)
+               v.AuxInt = i - 2
+               v.Aux = s
+               v.AddArg3(p, w0, mem)
                return true
        }
-       // match: (MOVWstore [i] {s} p1 (SHRLconst [16] w) x:(MOVWstore [i-2] {s} p0 w mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVLstore [i-2] {s} p0 w mem)
+       // match: (MOVWstore [i] {s} p1 (SHRLconst [16] w) x:(MOVWstore [i] {s} p0 w mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)
+       // result: (MOVLstore [i] {s} p0 w mem)
        for {
                i := v.AuxInt
                s := v.Aux
@@ -15544,23 +15184,23 @@ func rewriteValueAMD64_OpAMD64MOVWstore(v *Value) bool {
                }
                w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVWstore || x.AuxInt != i-2 || x.Aux != s {
+               if x.Op != OpAMD64MOVWstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
-               if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i - 2
+               v.AuxInt = i
                v.Aux = s
                v.AddArg3(p0, w, mem)
                return true
        }
-       // match: (MOVWstore [i] {s} p1 (SHRQconst [16] w) x:(MOVWstore [i-2] {s} p0 w mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVLstore [i-2] {s} p0 w mem)
+       // match: (MOVWstore [i] {s} p1 (SHRQconst [16] w) x:(MOVWstore [i] {s} p0 w mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)
+       // result: (MOVLstore [i] {s} p0 w mem)
        for {
                i := v.AuxInt
                s := v.Aux
@@ -15570,23 +15210,23 @@ func rewriteValueAMD64_OpAMD64MOVWstore(v *Value) bool {
                }
                w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVWstore || x.AuxInt != i-2 || x.Aux != s {
+               if x.Op != OpAMD64MOVWstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
-               if w != x.Args[1] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w != x.Args[1] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i - 2
+               v.AuxInt = i
                v.Aux = s
                v.AddArg3(p0, w, mem)
                return true
        }
-       // match: (MOVWstore [i] {s} p1 (SHRLconst [j] w) x:(MOVWstore [i-2] {s} p0 w0:(SHRLconst [j-16] w) mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVLstore [i-2] {s} p0 w0 mem)
+       // match: (MOVWstore [i] {s} p1 (SHRLconst [j] w) x:(MOVWstore [i] {s} p0 w0:(SHRLconst [j-16] w) mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)
+       // result: (MOVLstore [i] {s} p0 w0 mem)
        for {
                i := v.AuxInt
                s := v.Aux
@@ -15597,24 +15237,24 @@ func rewriteValueAMD64_OpAMD64MOVWstore(v *Value) bool {
                j := v_1.AuxInt
                w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVWstore || x.AuxInt != i-2 || x.Aux != s {
+               if x.Op != OpAMD64MOVWstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
                w0 := x.Args[1]
-               if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w0.Op != OpAMD64SHRLconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i - 2
+               v.AuxInt = i
                v.Aux = s
                v.AddArg3(p0, w0, mem)
                return true
        }
-       // match: (MOVWstore [i] {s} p1 (SHRQconst [j] w) x:(MOVWstore [i-2] {s} p0 w0:(SHRQconst [j-16] w) mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && clobber(x)
-       // result: (MOVLstore [i-2] {s} p0 w0 mem)
+       // match: (MOVWstore [i] {s} p1 (SHRQconst [j] w) x:(MOVWstore [i] {s} p0 w0:(SHRQconst [j-16] w) mem))
+       // cond: x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)
+       // result: (MOVLstore [i] {s} p0 w0 mem)
        for {
                i := v.AuxInt
                s := v.Aux
@@ -15625,17 +15265,17 @@ func rewriteValueAMD64_OpAMD64MOVWstore(v *Value) bool {
                j := v_1.AuxInt
                w := v_1.Args[0]
                x := v_2
-               if x.Op != OpAMD64MOVWstore || x.AuxInt != i-2 || x.Aux != s {
+               if x.Op != OpAMD64MOVWstore || x.AuxInt != i || x.Aux != s {
                        break
                }
                mem := x.Args[2]
                p0 := x.Args[0]
                w0 := x.Args[1]
-               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && same(p0, p1, 1) && clobber(x)) {
+               if w0.Op != OpAMD64SHRQconst || w0.AuxInt != j-16 || w != w0.Args[0] || !(x.Uses == 1 && sequentialAddresses(p0, p1, 2) && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVLstore)
-               v.AuxInt = i - 2
+               v.AuxInt = i
                v.Aux = s
                v.AddArg3(p0, w0, mem)
                return true
@@ -15774,13 +15414,13 @@ func rewriteValueAMD64_OpAMD64MOVWstoreconst(v *Value) bool {
                v.AddArg2(ptr, mem)
                return true
        }
-       // match: (MOVWstoreconst [c] {s} p1 x:(MOVWstoreconst [a] {s} p0 mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() && clobber(x)
-       // result: (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p0 mem)
+       // match: (MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
+       // cond: x.Uses == 1 && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() && clobber(x)
+       // result: (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p mem)
        for {
                c := v.AuxInt
                s := v.Aux
-               p1 := v_0
+               p := v_0
                x := v_1
                if x.Op != OpAMD64MOVWstoreconst {
                        break
@@ -15790,23 +15430,22 @@ func rewriteValueAMD64_OpAMD64MOVWstoreconst(v *Value) bool {
                        break
                }
                mem := x.Args[1]
-               p0 := x.Args[0]
-               if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+2 == ValAndOff(c).Off() && clobber(x)) {
+               if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+2 == ValAndOff(c).Off() && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVLstoreconst)
                v.AuxInt = makeValAndOff(ValAndOff(a).Val()&0xffff|ValAndOff(c).Val()<<16, ValAndOff(a).Off())
                v.Aux = s
-               v.AddArg2(p0, mem)
+               v.AddArg2(p, mem)
                return true
        }
-       // match: (MOVWstoreconst [a] {s} p1 x:(MOVWstoreconst [c] {s} p0 mem))
-       // cond: x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() && clobber(x)
-       // result: (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p0 mem)
+       // match: (MOVWstoreconst [a] {s} p x:(MOVWstoreconst [c] {s} p mem))
+       // cond: x.Uses == 1 && ValAndOff(a).Off() + 2 == ValAndOff(c).Off() && clobber(x)
+       // result: (MOVLstoreconst [makeValAndOff(ValAndOff(a).Val()&0xffff | ValAndOff(c).Val()<<16, ValAndOff(a).Off())] {s} p mem)
        for {
                a := v.AuxInt
                s := v.Aux
-               p1 := v_0
+               p := v_0
                x := v_1
                if x.Op != OpAMD64MOVWstoreconst {
                        break
@@ -15816,14 +15455,13 @@ func rewriteValueAMD64_OpAMD64MOVWstoreconst(v *Value) bool {
                        break
                }
                mem := x.Args[1]
-               p0 := x.Args[0]
-               if !(x.Uses == 1 && same(p0, p1, 1) && ValAndOff(a).Off()+2 == ValAndOff(c).Off() && clobber(x)) {
+               if p != x.Args[0] || !(x.Uses == 1 && ValAndOff(a).Off()+2 == ValAndOff(c).Off() && clobber(x)) {
                        break
                }
                v.reset(OpAMD64MOVLstoreconst)
                v.AuxInt = makeValAndOff(ValAndOff(a).Val()&0xffff|ValAndOff(c).Val()<<16, ValAndOff(a).Off())
                v.Aux = s
-               v.AddArg2(p0, mem)
+               v.AddArg2(p, mem)
                return true
        }
        // match: (MOVWstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem)
@@ -17887,9 +17525,9 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                v.copyOf(x)
                return true
        }
-       // match: (ORL x0:(MOVBload [i0] {s} p0 mem) sh:(SHLLconst [8] x1:(MOVBload [i1] {s} p1 mem)))
-       // cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-       // result: @mergePoint(b,x0,x1) (MOVWload [i0] {s} p0 mem)
+       // match: (ORL x0:(MOVBload [i0] {s} p mem) sh:(SHLLconst [8] x1:(MOVBload [i1] {s} p mem)))
+       // cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        x0 := v_0
@@ -17899,7 +17537,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        i0 := x0.AuxInt
                        s := x0.Aux
                        mem := x0.Args[1]
-                       p0 := x0.Args[0]
+                       p := x0.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 8 {
                                continue
@@ -17913,8 +17551,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                                continue
                        }
                        _ = x1.Args[1]
-                       p1 := x1.Args[0]
-                       if mem != x1.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                       if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
@@ -17922,14 +17559,50 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        v.copyOf(v0)
                        v0.AuxInt = i0
                        v0.Aux = s
+                       v0.AddArg2(p, mem)
+                       return true
+               }
+               break
+       }
+       // match: (ORL x0:(MOVBload [i] {s} p0 mem) sh:(SHLLconst [8] x1:(MOVBload [i] {s} p1 mem)))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x0 := v_0
+                       if x0.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i := x0.AuxInt
+                       s := x0.Aux
+                       mem := x0.Args[1]
+                       p0 := x0.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 8 {
+                               continue
+                       }
+                       x1 := sh.Args[0]
+                       if x1.Op != OpAMD64MOVBload || x1.AuxInt != i || x1.Aux != s {
+                               continue
+                       }
+                       _ = x1.Args[1]
+                       p1 := x1.Args[0]
+                       if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
+                       v.copyOf(v0)
+                       v0.AuxInt = i
+                       v0.Aux = s
                        v0.AddArg2(p0, mem)
                        return true
                }
                break
        }
-       // match: (ORL x0:(MOVWload [i0] {s} p0 mem) sh:(SHLLconst [16] x1:(MOVWload [i1] {s} p1 mem)))
-       // cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-       // result: @mergePoint(b,x0,x1) (MOVLload [i0] {s} p0 mem)
+       // match: (ORL x0:(MOVWload [i0] {s} p mem) sh:(SHLLconst [16] x1:(MOVWload [i1] {s} p mem)))
+       // cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        x0 := v_0
@@ -17939,7 +17612,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        i0 := x0.AuxInt
                        s := x0.Aux
                        mem := x0.Args[1]
-                       p0 := x0.Args[0]
+                       p := x0.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 16 {
                                continue
@@ -17953,8 +17626,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                                continue
                        }
                        _ = x1.Args[1]
-                       p1 := x1.Args[0]
-                       if mem != x1.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                       if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
@@ -17962,14 +17634,50 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        v.copyOf(v0)
                        v0.AuxInt = i0
                        v0.Aux = s
+                       v0.AddArg2(p, mem)
+                       return true
+               }
+               break
+       }
+       // match: (ORL x0:(MOVWload [i] {s} p0 mem) sh:(SHLLconst [16] x1:(MOVWload [i] {s} p1 mem)))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x0 := v_0
+                       if x0.Op != OpAMD64MOVWload {
+                               continue
+                       }
+                       i := x0.AuxInt
+                       s := x0.Aux
+                       mem := x0.Args[1]
+                       p0 := x0.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 16 {
+                               continue
+                       }
+                       x1 := sh.Args[0]
+                       if x1.Op != OpAMD64MOVWload || x1.AuxInt != i || x1.Aux != s {
+                               continue
+                       }
+                       _ = x1.Args[1]
+                       p1 := x1.Args[0]
+                       if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x1.Pos, OpAMD64MOVLload, typ.UInt32)
+                       v.copyOf(v0)
+                       v0.AuxInt = i
+                       v0.Aux = s
                        v0.AddArg2(p0, mem)
                        return true
                }
                break
        }
-       // match: (ORL s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p0 mem)) or:(ORL s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p1 mem)) y))
-       // cond: i1 == i0+1 && j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-       // result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j0] (MOVWload [i0] {s} p0 mem)) y)
+       // match: (ORL s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p mem)) or:(ORL s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p mem)) y))
+       // cond: i1 == i0+1 && j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j0] (MOVWload [i0] {s} p mem)) y)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        s1 := v_0
@@ -17984,7 +17692,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        i1 := x1.AuxInt
                        s := x1.Aux
                        mem := x1.Args[1]
-                       p0 := x1.Args[0]
+                       p := x1.Args[0]
                        or := v_1
                        if or.Op != OpAMD64ORL {
                                continue
@@ -18007,12 +17715,11 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                                        continue
                                }
                                _ = x0.Args[1]
-                               p1 := x0.Args[0]
-                               if mem != x0.Args[1] {
+                               if p != x0.Args[0] || mem != x0.Args[1] {
                                        continue
                                }
                                y := or_1
-                               if !(i1 == i0+1 && j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                               if !(i1 == i0+1 && j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
                                        continue
                                }
                                b = mergePoint(b, x0, x1, y)
@@ -18023,6 +17730,66 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                                v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
                                v2.AuxInt = i0
                                v2.Aux = s
+                               v2.AddArg2(p, mem)
+                               v1.AddArg(v2)
+                               v0.AddArg2(v1, y)
+                               return true
+                       }
+               }
+               break
+       }
+       // match: (ORL s1:(SHLLconst [j1] x1:(MOVBload [i] {s} p1 mem)) or:(ORL s0:(SHLLconst [j0] x0:(MOVBload [i] {s} p0 mem)) y))
+       // cond: j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       s1 := v_0
+                       if s1.Op != OpAMD64SHLLconst {
+                               continue
+                       }
+                       j1 := s1.AuxInt
+                       x1 := s1.Args[0]
+                       if x1.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i := x1.AuxInt
+                       s := x1.Aux
+                       mem := x1.Args[1]
+                       p1 := x1.Args[0]
+                       or := v_1
+                       if or.Op != OpAMD64ORL {
+                               continue
+                       }
+                       _ = or.Args[1]
+                       or_0 := or.Args[0]
+                       or_1 := or.Args[1]
+                       for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+                               s0 := or_0
+                               if s0.Op != OpAMD64SHLLconst {
+                                       continue
+                               }
+                               j0 := s0.AuxInt
+                               x0 := s0.Args[0]
+                               if x0.Op != OpAMD64MOVBload || x0.AuxInt != i || x0.Aux != s {
+                                       continue
+                               }
+                               _ = x0.Args[1]
+                               p0 := x0.Args[0]
+                               if mem != x0.Args[1] {
+                                       continue
+                               }
+                               y := or_1
+                               if !(j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                                       continue
+                               }
+                               b = mergePoint(b, x0, x1, y)
+                               v0 := b.NewValue0(x0.Pos, OpAMD64ORL, v.Type)
+                               v.copyOf(v0)
+                               v1 := b.NewValue0(x0.Pos, OpAMD64SHLLconst, v.Type)
+                               v1.AuxInt = j0
+                               v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+                               v2.AuxInt = i
+                               v2.Aux = s
                                v2.AddArg2(p0, mem)
                                v1.AddArg(v2)
                                v0.AddArg2(v1, y)
@@ -18031,9 +17798,9 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                }
                break
        }
-       // match: (ORL x1:(MOVBload [i1] {s} p0 mem) sh:(SHLLconst [8] x0:(MOVBload [i0] {s} p1 mem)))
-       // cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-       // result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p0 mem))
+       // match: (ORL x1:(MOVBload [i1] {s} p mem) sh:(SHLLconst [8] x0:(MOVBload [i0] {s} p mem)))
+       // cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem))
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        x1 := v_0
@@ -18043,7 +17810,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        i1 := x1.AuxInt
                        s := x1.Aux
                        mem := x1.Args[1]
-                       p0 := x1.Args[0]
+                       p := x1.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 8 {
                                continue
@@ -18057,8 +17824,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                                continue
                        }
                        _ = x0.Args[1]
-                       p1 := x0.Args[0]
-                       if mem != x0.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                       if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
@@ -18068,15 +17834,54 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        v1 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
                        v1.AuxInt = i0
                        v1.Aux = s
+                       v1.AddArg2(p, mem)
+                       v0.AddArg(v1)
+                       return true
+               }
+               break
+       }
+       // match: (ORL x1:(MOVBload [i] {s} p1 mem) sh:(SHLLconst [8] x0:(MOVBload [i] {s} p0 mem)))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem))
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x1 := v_0
+                       if x1.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i := x1.AuxInt
+                       s := x1.Aux
+                       mem := x1.Args[1]
+                       p1 := x1.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 8 {
+                               continue
+                       }
+                       x0 := sh.Args[0]
+                       if x0.Op != OpAMD64MOVBload || x0.AuxInt != i || x0.Aux != s {
+                               continue
+                       }
+                       _ = x0.Args[1]
+                       p0 := x0.Args[0]
+                       if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, v.Type)
+                       v.copyOf(v0)
+                       v0.AuxInt = 8
+                       v1 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+                       v1.AuxInt = i
+                       v1.Aux = s
                        v1.AddArg2(p0, mem)
                        v0.AddArg(v1)
                        return true
                }
                break
        }
-       // match: (ORL r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p0 mem)) sh:(SHLLconst [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p1 mem))))
-       // cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
-       // result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p0 mem))
+       // match: (ORL r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)) sh:(SHLLconst [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))))
+       // cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+       // result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem))
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        r1 := v_0
@@ -18090,7 +17895,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        i1 := x1.AuxInt
                        s := x1.Aux
                        mem := x1.Args[1]
-                       p0 := x1.Args[0]
+                       p := x1.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 16 {
                                continue
@@ -18108,15 +17913,60 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                                continue
                        }
                        _ = x0.Args[1]
-                       p1 := x0.Args[0]
-                       if mem != x0.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+                       if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, v.Type)
+                       v.copyOf(v0)
+                       v1 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
+                       v1.AuxInt = i0
+                       v1.Aux = s
+                       v1.AddArg2(p, mem)
+                       v0.AddArg(v1)
+                       return true
+               }
+               break
+       }
+       // match: (ORL r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)) sh:(SHLLconst [16] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+       // result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem))
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       r1 := v_0
+                       if r1.Op != OpAMD64ROLWconst || r1.AuxInt != 8 {
+                               continue
+                       }
+                       x1 := r1.Args[0]
+                       if x1.Op != OpAMD64MOVWload {
+                               continue
+                       }
+                       i := x1.AuxInt
+                       s := x1.Aux
+                       mem := x1.Args[1]
+                       p1 := x1.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLLconst || sh.AuxInt != 16 {
+                               continue
+                       }
+                       r0 := sh.Args[0]
+                       if r0.Op != OpAMD64ROLWconst || r0.AuxInt != 8 {
+                               continue
+                       }
+                       x0 := r0.Args[0]
+                       if x0.Op != OpAMD64MOVWload || x0.AuxInt != i || x0.Aux != s {
+                               continue
+                       }
+                       _ = x0.Args[1]
+                       p0 := x0.Args[0]
+                       if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
                        v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, v.Type)
                        v.copyOf(v0)
                        v1 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
-                       v1.AuxInt = i0
+                       v1.AuxInt = i
                        v1.Aux = s
                        v1.AddArg2(p0, mem)
                        v0.AddArg(v1)
@@ -18124,9 +17974,9 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                }
                break
        }
-       // match: (ORL s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p0 mem)) or:(ORL s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p1 mem)) y))
-       // cond: i1 == i0+1 && j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-       // result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p0 mem))) y)
+       // match: (ORL s0:(SHLLconst [j0] x0:(MOVBload [i0] {s} p mem)) or:(ORL s1:(SHLLconst [j1] x1:(MOVBload [i1] {s} p mem)) y))
+       // cond: i1 == i0+1 && j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        s0 := v_0
@@ -18141,7 +17991,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                        i0 := x0.AuxInt
                        s := x0.Aux
                        mem := x0.Args[1]
-                       p0 := x0.Args[0]
+                       p := x0.Args[0]
                        or := v_1
                        if or.Op != OpAMD64ORL {
                                continue
@@ -18164,12 +18014,74 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                                        continue
                                }
                                _ = x1.Args[1]
+                               if p != x1.Args[0] || mem != x1.Args[1] {
+                                       continue
+                               }
+                               y := or_1
+                               if !(i1 == i0+1 && j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                                       continue
+                               }
+                               b = mergePoint(b, x0, x1, y)
+                               v0 := b.NewValue0(x1.Pos, OpAMD64ORL, v.Type)
+                               v.copyOf(v0)
+                               v1 := b.NewValue0(x1.Pos, OpAMD64SHLLconst, v.Type)
+                               v1.AuxInt = j1
+                               v2 := b.NewValue0(x1.Pos, OpAMD64ROLWconst, typ.UInt16)
+                               v2.AuxInt = 8
+                               v3 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
+                               v3.AuxInt = i0
+                               v3.Aux = s
+                               v3.AddArg2(p, mem)
+                               v2.AddArg(v3)
+                               v1.AddArg(v2)
+                               v0.AddArg2(v1, y)
+                               return true
+                       }
+               }
+               break
+       }
+       // match: (ORL s0:(SHLLconst [j0] x0:(MOVBload [i] {s} p0 mem)) or:(ORL s1:(SHLLconst [j1] x1:(MOVBload [i] {s} p1 mem)) y))
+       // cond: j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORL <v.Type> (SHLLconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       s0 := v_0
+                       if s0.Op != OpAMD64SHLLconst {
+                               continue
+                       }
+                       j0 := s0.AuxInt
+                       x0 := s0.Args[0]
+                       if x0.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i := x0.AuxInt
+                       s := x0.Aux
+                       mem := x0.Args[1]
+                       p0 := x0.Args[0]
+                       or := v_1
+                       if or.Op != OpAMD64ORL {
+                               continue
+                       }
+                       _ = or.Args[1]
+                       or_0 := or.Args[0]
+                       or_1 := or.Args[1]
+                       for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+                               s1 := or_0
+                               if s1.Op != OpAMD64SHLLconst {
+                                       continue
+                               }
+                               j1 := s1.AuxInt
+                               x1 := s1.Args[0]
+                               if x1.Op != OpAMD64MOVBload || x1.AuxInt != i || x1.Aux != s {
+                                       continue
+                               }
+                               _ = x1.Args[1]
                                p1 := x1.Args[0]
                                if mem != x1.Args[1] {
                                        continue
                                }
                                y := or_1
-                               if !(i1 == i0+1 && j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                               if !(j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
                                        continue
                                }
                                b = mergePoint(b, x0, x1, y)
@@ -18180,7 +18092,7 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value) bool {
                                v2 := b.NewValue0(x1.Pos, OpAMD64ROLWconst, typ.UInt16)
                                v2.AuxInt = 8
                                v3 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
-                               v3.AuxInt = i0
+                               v3.AuxInt = i
                                v3.Aux = s
                                v3.AddArg2(p0, mem)
                                v2.AddArg(v3)
@@ -18804,9 +18716,9 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                v.copyOf(x)
                return true
        }
-       // match: (ORQ x0:(MOVBload [i0] {s} p0 mem) sh:(SHLQconst [8] x1:(MOVBload [i1] {s} p1 mem)))
-       // cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-       // result: @mergePoint(b,x0,x1) (MOVWload [i0] {s} p0 mem)
+       // match: (ORQ x0:(MOVBload [i0] {s} p mem) sh:(SHLQconst [8] x1:(MOVBload [i1] {s} p mem)))
+       // cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVWload [i0] {s} p mem)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        x0 := v_0
@@ -18816,7 +18728,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        i0 := x0.AuxInt
                        s := x0.Aux
                        mem := x0.Args[1]
-                       p0 := x0.Args[0]
+                       p := x0.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 8 {
                                continue
@@ -18830,8 +18742,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                continue
                        }
                        _ = x1.Args[1]
-                       p1 := x1.Args[0]
-                       if mem != x1.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                       if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
@@ -18839,14 +18750,50 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        v.copyOf(v0)
                        v0.AuxInt = i0
                        v0.Aux = s
+                       v0.AddArg2(p, mem)
+                       return true
+               }
+               break
+       }
+       // match: (ORQ x0:(MOVBload [i] {s} p0 mem) sh:(SHLQconst [8] x1:(MOVBload [i] {s} p1 mem)))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVWload [i] {s} p0 mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x0 := v_0
+                       if x0.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i := x0.AuxInt
+                       s := x0.Aux
+                       mem := x0.Args[1]
+                       p0 := x0.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 8 {
+                               continue
+                       }
+                       x1 := sh.Args[0]
+                       if x1.Op != OpAMD64MOVBload || x1.AuxInt != i || x1.Aux != s {
+                               continue
+                       }
+                       _ = x1.Args[1]
+                       p1 := x1.Args[0]
+                       if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
+                       v.copyOf(v0)
+                       v0.AuxInt = i
+                       v0.Aux = s
                        v0.AddArg2(p0, mem)
                        return true
                }
                break
        }
-       // match: (ORQ x0:(MOVWload [i0] {s} p0 mem) sh:(SHLQconst [16] x1:(MOVWload [i1] {s} p1 mem)))
-       // cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-       // result: @mergePoint(b,x0,x1) (MOVLload [i0] {s} p0 mem)
+       // match: (ORQ x0:(MOVWload [i0] {s} p mem) sh:(SHLQconst [16] x1:(MOVWload [i1] {s} p mem)))
+       // cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVLload [i0] {s} p mem)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        x0 := v_0
@@ -18856,7 +18803,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        i0 := x0.AuxInt
                        s := x0.Aux
                        mem := x0.Args[1]
-                       p0 := x0.Args[0]
+                       p := x0.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 16 {
                                continue
@@ -18870,8 +18817,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                continue
                        }
                        _ = x1.Args[1]
-                       p1 := x1.Args[0]
-                       if mem != x1.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                       if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
@@ -18879,14 +18825,50 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        v.copyOf(v0)
                        v0.AuxInt = i0
                        v0.Aux = s
+                       v0.AddArg2(p, mem)
+                       return true
+               }
+               break
+       }
+       // match: (ORQ x0:(MOVWload [i] {s} p0 mem) sh:(SHLQconst [16] x1:(MOVWload [i] {s} p1 mem)))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVLload [i] {s} p0 mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x0 := v_0
+                       if x0.Op != OpAMD64MOVWload {
+                               continue
+                       }
+                       i := x0.AuxInt
+                       s := x0.Aux
+                       mem := x0.Args[1]
+                       p0 := x0.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 16 {
+                               continue
+                       }
+                       x1 := sh.Args[0]
+                       if x1.Op != OpAMD64MOVWload || x1.AuxInt != i || x1.Aux != s {
+                               continue
+                       }
+                       _ = x1.Args[1]
+                       p1 := x1.Args[0]
+                       if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x1.Pos, OpAMD64MOVLload, typ.UInt32)
+                       v.copyOf(v0)
+                       v0.AuxInt = i
+                       v0.Aux = s
                        v0.AddArg2(p0, mem)
                        return true
                }
                break
        }
-       // match: (ORQ x0:(MOVLload [i0] {s} p0 mem) sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p1 mem)))
-       // cond: i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-       // result: @mergePoint(b,x0,x1) (MOVQload [i0] {s} p0 mem)
+       // match: (ORQ x0:(MOVLload [i0] {s} p mem) sh:(SHLQconst [32] x1:(MOVLload [i1] {s} p mem)))
+       // cond: i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVQload [i0] {s} p mem)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        x0 := v_0
@@ -18896,7 +18878,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        i0 := x0.AuxInt
                        s := x0.Aux
                        mem := x0.Args[1]
-                       p0 := x0.Args[0]
+                       p := x0.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 32 {
                                continue
@@ -18910,23 +18892,181 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                continue
                        }
                        _ = x1.Args[1]
+                       if p != x1.Args[0] || mem != x1.Args[1] || !(i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x1.Pos, OpAMD64MOVQload, typ.UInt64)
+                       v.copyOf(v0)
+                       v0.AuxInt = i0
+                       v0.Aux = s
+                       v0.AddArg2(p, mem)
+                       return true
+               }
+               break
+       }
+       // match: (ORQ x0:(MOVLload [i] {s} p0 mem) sh:(SHLQconst [32] x1:(MOVLload [i] {s} p1 mem)))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 4) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (MOVQload [i] {s} p0 mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x0 := v_0
+                       if x0.Op != OpAMD64MOVLload {
+                               continue
+                       }
+                       i := x0.AuxInt
+                       s := x0.Aux
+                       mem := x0.Args[1]
+                       p0 := x0.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 32 {
+                               continue
+                       }
+                       x1 := sh.Args[0]
+                       if x1.Op != OpAMD64MOVLload || x1.AuxInt != i || x1.Aux != s {
+                               continue
+                       }
+                       _ = x1.Args[1]
+                       p1 := x1.Args[0]
+                       if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 4) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x1.Pos, OpAMD64MOVQload, typ.UInt64)
+                       v.copyOf(v0)
+                       v0.AuxInt = i
+                       v0.Aux = s
+                       v0.AddArg2(p0, mem)
+                       return true
+               }
+               break
+       }
+       // match: (ORQ s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p mem)) y))
+       // cond: i1 == i0+1 && j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVWload [i0] {s} p mem)) y)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       s1 := v_0
+                       if s1.Op != OpAMD64SHLQconst {
+                               continue
+                       }
+                       j1 := s1.AuxInt
+                       x1 := s1.Args[0]
+                       if x1.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i1 := x1.AuxInt
+                       s := x1.Aux
+                       mem := x1.Args[1]
+                       p := x1.Args[0]
+                       or := v_1
+                       if or.Op != OpAMD64ORQ {
+                               continue
+                       }
+                       _ = or.Args[1]
+                       or_0 := or.Args[0]
+                       or_1 := or.Args[1]
+                       for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+                               s0 := or_0
+                               if s0.Op != OpAMD64SHLQconst {
+                                       continue
+                               }
+                               j0 := s0.AuxInt
+                               x0 := s0.Args[0]
+                               if x0.Op != OpAMD64MOVBload {
+                                       continue
+                               }
+                               i0 := x0.AuxInt
+                               if x0.Aux != s {
+                                       continue
+                               }
+                               _ = x0.Args[1]
+                               if p != x0.Args[0] || mem != x0.Args[1] {
+                                       continue
+                               }
+                               y := or_1
+                               if !(i1 == i0+1 && j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                                       continue
+                               }
+                               b = mergePoint(b, x0, x1, y)
+                               v0 := b.NewValue0(x0.Pos, OpAMD64ORQ, v.Type)
+                               v.copyOf(v0)
+                               v1 := b.NewValue0(x0.Pos, OpAMD64SHLQconst, v.Type)
+                               v1.AuxInt = j0
+                               v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+                               v2.AuxInt = i0
+                               v2.Aux = s
+                               v2.AddArg2(p, mem)
+                               v1.AddArg(v2)
+                               v0.AddArg2(v1, y)
+                               return true
+                       }
+               }
+               break
+       }
+       // match: (ORQ s1:(SHLQconst [j1] x1:(MOVBload [i] {s} p1 mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVBload [i] {s} p0 mem)) y))
+       // cond: j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVWload [i] {s} p0 mem)) y)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       s1 := v_0
+                       if s1.Op != OpAMD64SHLQconst {
+                               continue
+                       }
+                       j1 := s1.AuxInt
+                       x1 := s1.Args[0]
+                       if x1.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i := x1.AuxInt
+                       s := x1.Aux
+                       mem := x1.Args[1]
                        p1 := x1.Args[0]
-                       if mem != x1.Args[1] || !(i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                       or := v_1
+                       if or.Op != OpAMD64ORQ {
                                continue
                        }
-                       b = mergePoint(b, x0, x1)
-                       v0 := b.NewValue0(x1.Pos, OpAMD64MOVQload, typ.UInt64)
-                       v.copyOf(v0)
-                       v0.AuxInt = i0
-                       v0.Aux = s
-                       v0.AddArg2(p0, mem)
-                       return true
+                       _ = or.Args[1]
+                       or_0 := or.Args[0]
+                       or_1 := or.Args[1]
+                       for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+                               s0 := or_0
+                               if s0.Op != OpAMD64SHLQconst {
+                                       continue
+                               }
+                               j0 := s0.AuxInt
+                               x0 := s0.Args[0]
+                               if x0.Op != OpAMD64MOVBload || x0.AuxInt != i || x0.Aux != s {
+                                       continue
+                               }
+                               _ = x0.Args[1]
+                               p0 := x0.Args[0]
+                               if mem != x0.Args[1] {
+                                       continue
+                               }
+                               y := or_1
+                               if !(j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                                       continue
+                               }
+                               b = mergePoint(b, x0, x1, y)
+                               v0 := b.NewValue0(x0.Pos, OpAMD64ORQ, v.Type)
+                               v.copyOf(v0)
+                               v1 := b.NewValue0(x0.Pos, OpAMD64SHLQconst, v.Type)
+                               v1.AuxInt = j0
+                               v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+                               v2.AuxInt = i
+                               v2.Aux = s
+                               v2.AddArg2(p0, mem)
+                               v1.AddArg(v2)
+                               v0.AddArg2(v1, y)
+                               return true
+                       }
                }
                break
        }
-       // match: (ORQ s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p0 mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p1 mem)) y))
-       // cond: i1 == i0+1 && j1 == j0+8 && j0 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVWload [i0] {s} p0 mem)) y)
+       // match: (ORQ s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p mem)) y))
+       // cond: i1 == i0+2 && j1 == j0+16 && j0 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p mem)) y)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        s1 := v_0
@@ -18935,13 +19075,13 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        }
                        j1 := s1.AuxInt
                        x1 := s1.Args[0]
-                       if x1.Op != OpAMD64MOVBload {
+                       if x1.Op != OpAMD64MOVWload {
                                continue
                        }
                        i1 := x1.AuxInt
                        s := x1.Aux
                        mem := x1.Args[1]
-                       p0 := x1.Args[0]
+                       p := x1.Args[0]
                        or := v_1
                        if or.Op != OpAMD64ORQ {
                                continue
@@ -18956,7 +19096,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                }
                                j0 := s0.AuxInt
                                x0 := s0.Args[0]
-                               if x0.Op != OpAMD64MOVBload {
+                               if x0.Op != OpAMD64MOVWload {
                                        continue
                                }
                                i0 := x0.AuxInt
@@ -18964,12 +19104,11 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                        continue
                                }
                                _ = x0.Args[1]
-                               p1 := x0.Args[0]
-                               if mem != x0.Args[1] {
+                               if p != x0.Args[0] || mem != x0.Args[1] {
                                        continue
                                }
                                y := or_1
-                               if !(i1 == i0+1 && j1 == j0+8 && j0%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                               if !(i1 == i0+2 && j1 == j0+16 && j0%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
                                        continue
                                }
                                b = mergePoint(b, x0, x1, y)
@@ -18977,10 +19116,10 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                v.copyOf(v0)
                                v1 := b.NewValue0(x0.Pos, OpAMD64SHLQconst, v.Type)
                                v1.AuxInt = j0
-                               v2 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+                               v2 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
                                v2.AuxInt = i0
                                v2.Aux = s
-                               v2.AddArg2(p0, mem)
+                               v2.AddArg2(p, mem)
                                v1.AddArg(v2)
                                v0.AddArg2(v1, y)
                                return true
@@ -18988,9 +19127,9 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                }
                break
        }
-       // match: (ORQ s1:(SHLQconst [j1] x1:(MOVWload [i1] {s} p0 mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVWload [i0] {s} p1 mem)) y))
-       // cond: i1 == i0+2 && j1 == j0+16 && j0 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i0] {s} p0 mem)) y)
+       // match: (ORQ s1:(SHLQconst [j1] x1:(MOVWload [i] {s} p1 mem)) or:(ORQ s0:(SHLQconst [j0] x0:(MOVWload [i] {s} p0 mem)) y))
+       // cond: j1 == j0+16 && j0 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j0] (MOVLload [i] {s} p0 mem)) y)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        s1 := v_0
@@ -19002,10 +19141,10 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        if x1.Op != OpAMD64MOVWload {
                                continue
                        }
-                       i1 := x1.AuxInt
+                       i := x1.AuxInt
                        s := x1.Aux
                        mem := x1.Args[1]
-                       p0 := x1.Args[0]
+                       p1 := x1.Args[0]
                        or := v_1
                        if or.Op != OpAMD64ORQ {
                                continue
@@ -19020,20 +19159,16 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                }
                                j0 := s0.AuxInt
                                x0 := s0.Args[0]
-                               if x0.Op != OpAMD64MOVWload {
-                                       continue
-                               }
-                               i0 := x0.AuxInt
-                               if x0.Aux != s {
+                               if x0.Op != OpAMD64MOVWload || x0.AuxInt != i || x0.Aux != s {
                                        continue
                                }
                                _ = x0.Args[1]
-                               p1 := x0.Args[0]
+                               p0 := x0.Args[0]
                                if mem != x0.Args[1] {
                                        continue
                                }
                                y := or_1
-                               if !(i1 == i0+2 && j1 == j0+16 && j0%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                               if !(j1 == j0+16 && j0%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
                                        continue
                                }
                                b = mergePoint(b, x0, x1, y)
@@ -19042,7 +19177,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                v1 := b.NewValue0(x0.Pos, OpAMD64SHLQconst, v.Type)
                                v1.AuxInt = j0
                                v2 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
-                               v2.AuxInt = i0
+                               v2.AuxInt = i
                                v2.Aux = s
                                v2.AddArg2(p0, mem)
                                v1.AddArg(v2)
@@ -19052,9 +19187,9 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                }
                break
        }
-       // match: (ORQ x1:(MOVBload [i1] {s} p0 mem) sh:(SHLQconst [8] x0:(MOVBload [i0] {s} p1 mem)))
-       // cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
-       // result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p0 mem))
+       // match: (ORQ x1:(MOVBload [i1] {s} p mem) sh:(SHLQconst [8] x0:(MOVBload [i0] {s} p mem)))
+       // cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i0] {s} p mem))
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        x1 := v_0
@@ -19064,7 +19199,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        i1 := x1.AuxInt
                        s := x1.Aux
                        mem := x1.Args[1]
-                       p0 := x1.Args[0]
+                       p := x1.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 8 {
                                continue
@@ -19078,8 +19213,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                continue
                        }
                        _ = x0.Args[1]
-                       p1 := x0.Args[0]
-                       if mem != x0.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                       if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
@@ -19089,15 +19223,54 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        v1 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
                        v1.AuxInt = i0
                        v1.Aux = s
+                       v1.AddArg2(p, mem)
+                       v0.AddArg(v1)
+                       return true
+               }
+               break
+       }
+       // match: (ORQ x1:(MOVBload [i] {s} p1 mem) sh:(SHLQconst [8] x0:(MOVBload [i] {s} p0 mem)))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
+       // result: @mergePoint(b,x0,x1) (ROLWconst <v.Type> [8] (MOVWload [i] {s} p0 mem))
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x1 := v_0
+                       if x1.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i := x1.AuxInt
+                       s := x1.Aux
+                       mem := x1.Args[1]
+                       p1 := x1.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 8 {
+                               continue
+                       }
+                       x0 := sh.Args[0]
+                       if x0.Op != OpAMD64MOVBload || x0.AuxInt != i || x0.Aux != s {
+                               continue
+                       }
+                       _ = x0.Args[1]
+                       p0 := x0.Args[0]
+                       if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x0.Pos, OpAMD64ROLWconst, v.Type)
+                       v.copyOf(v0)
+                       v0.AuxInt = 8
+                       v1 := b.NewValue0(x0.Pos, OpAMD64MOVWload, typ.UInt16)
+                       v1.AuxInt = i
+                       v1.Aux = s
                        v1.AddArg2(p0, mem)
                        v0.AddArg(v1)
                        return true
                }
                break
        }
-       // match: (ORQ r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p0 mem)) sh:(SHLQconst [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p1 mem))))
-       // cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
-       // result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p0 mem))
+       // match: (ORQ r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem)) sh:(SHLQconst [16] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))))
+       // cond: i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+       // result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i0] {s} p mem))
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        r1 := v_0
@@ -19111,7 +19284,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        i1 := x1.AuxInt
                        s := x1.Aux
                        mem := x1.Args[1]
-                       p0 := x1.Args[0]
+                       p := x1.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 16 {
                                continue
@@ -19129,8 +19302,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                continue
                        }
                        _ = x0.Args[1]
-                       p1 := x0.Args[0]
-                       if mem != x0.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+                       if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+2 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
@@ -19139,15 +19311,61 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        v1 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
                        v1.AuxInt = i0
                        v1.Aux = s
+                       v1.AddArg2(p, mem)
+                       v0.AddArg(v1)
+                       return true
+               }
+               break
+       }
+       // match: (ORQ r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem)) sh:(SHLQconst [16] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+       // result: @mergePoint(b,x0,x1) (BSWAPL <v.Type> (MOVLload [i] {s} p0 mem))
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       r1 := v_0
+                       if r1.Op != OpAMD64ROLWconst || r1.AuxInt != 8 {
+                               continue
+                       }
+                       x1 := r1.Args[0]
+                       if x1.Op != OpAMD64MOVWload {
+                               continue
+                       }
+                       i := x1.AuxInt
+                       s := x1.Aux
+                       mem := x1.Args[1]
+                       p1 := x1.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 16 {
+                               continue
+                       }
+                       r0 := sh.Args[0]
+                       if r0.Op != OpAMD64ROLWconst || r0.AuxInt != 8 {
+                               continue
+                       }
+                       x0 := r0.Args[0]
+                       if x0.Op != OpAMD64MOVWload || x0.AuxInt != i || x0.Aux != s {
+                               continue
+                       }
+                       _ = x0.Args[1]
+                       p0 := x0.Args[0]
+                       if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPL, v.Type)
+                       v.copyOf(v0)
+                       v1 := b.NewValue0(x0.Pos, OpAMD64MOVLload, typ.UInt32)
+                       v1.AuxInt = i
+                       v1.Aux = s
                        v1.AddArg2(p0, mem)
                        v0.AddArg(v1)
                        return true
                }
                break
        }
-       // match: (ORQ r1:(BSWAPL x1:(MOVLload [i1] {s} p0 mem)) sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i0] {s} p1 mem))))
-       // cond: i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
-       // result: @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p0 mem))
+       // match: (ORQ r1:(BSWAPL x1:(MOVLload [i1] {s} p mem)) sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i0] {s} p mem))))
+       // cond: i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+       // result: @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i0] {s} p mem))
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        r1 := v_0
@@ -19161,7 +19379,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        i1 := x1.AuxInt
                        s := x1.Aux
                        mem := x1.Args[1]
-                       p0 := x1.Args[0]
+                       p := x1.Args[0]
                        sh := v_1
                        if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 32 {
                                continue
@@ -19179,8 +19397,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                continue
                        }
                        _ = x0.Args[1]
-                       p1 := x0.Args[0]
-                       if mem != x0.Args[1] || !(i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+                       if p != x0.Args[0] || mem != x0.Args[1] || !(i1 == i0+4 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
                                continue
                        }
                        b = mergePoint(b, x0, x1)
@@ -19189,15 +19406,61 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        v1 := b.NewValue0(x0.Pos, OpAMD64MOVQload, typ.UInt64)
                        v1.AuxInt = i0
                        v1.Aux = s
+                       v1.AddArg2(p, mem)
+                       v0.AddArg(v1)
+                       return true
+               }
+               break
+       }
+       // match: (ORQ r1:(BSWAPL x1:(MOVLload [i] {s} p1 mem)) sh:(SHLQconst [32] r0:(BSWAPL x0:(MOVLload [i] {s} p0 mem))))
+       // cond: x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 4) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, r0, r1, sh)
+       // result: @mergePoint(b,x0,x1) (BSWAPQ <v.Type> (MOVQload [i] {s} p0 mem))
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       r1 := v_0
+                       if r1.Op != OpAMD64BSWAPL {
+                               continue
+                       }
+                       x1 := r1.Args[0]
+                       if x1.Op != OpAMD64MOVLload {
+                               continue
+                       }
+                       i := x1.AuxInt
+                       s := x1.Aux
+                       mem := x1.Args[1]
+                       p1 := x1.Args[0]
+                       sh := v_1
+                       if sh.Op != OpAMD64SHLQconst || sh.AuxInt != 32 {
+                               continue
+                       }
+                       r0 := sh.Args[0]
+                       if r0.Op != OpAMD64BSWAPL {
+                               continue
+                       }
+                       x0 := r0.Args[0]
+                       if x0.Op != OpAMD64MOVLload || x0.AuxInt != i || x0.Aux != s {
+                               continue
+                       }
+                       _ = x0.Args[1]
+                       p0 := x0.Args[0]
+                       if mem != x0.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p0, p1, 4) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, r0, r1, sh)) {
+                               continue
+                       }
+                       b = mergePoint(b, x0, x1)
+                       v0 := b.NewValue0(x0.Pos, OpAMD64BSWAPQ, v.Type)
+                       v.copyOf(v0)
+                       v1 := b.NewValue0(x0.Pos, OpAMD64MOVQload, typ.UInt64)
+                       v1.AuxInt = i
+                       v1.Aux = s
                        v1.AddArg2(p0, mem)
                        v0.AddArg(v1)
                        return true
                }
                break
        }
-       // match: (ORQ s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p0 mem)) or:(ORQ s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p1 mem)) y))
-       // cond: i1 == i0+1 && j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
-       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p0 mem))) y)
+       // match: (ORQ s0:(SHLQconst [j0] x0:(MOVBload [i0] {s} p mem)) or:(ORQ s1:(SHLQconst [j1] x1:(MOVBload [i1] {s} p mem)) y))
+       // cond: i1 == i0+1 && j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i0] {s} p mem))) y)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        s0 := v_0
@@ -19212,7 +19475,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        i0 := x0.AuxInt
                        s := x0.Aux
                        mem := x0.Args[1]
-                       p0 := x0.Args[0]
+                       p := x0.Args[0]
                        or := v_1
                        if or.Op != OpAMD64ORQ {
                                continue
@@ -19235,12 +19498,74 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                        continue
                                }
                                _ = x1.Args[1]
+                               if p != x1.Args[0] || mem != x1.Args[1] {
+                                       continue
+                               }
+                               y := or_1
+                               if !(i1 == i0+1 && j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                                       continue
+                               }
+                               b = mergePoint(b, x0, x1, y)
+                               v0 := b.NewValue0(x1.Pos, OpAMD64ORQ, v.Type)
+                               v.copyOf(v0)
+                               v1 := b.NewValue0(x1.Pos, OpAMD64SHLQconst, v.Type)
+                               v1.AuxInt = j1
+                               v2 := b.NewValue0(x1.Pos, OpAMD64ROLWconst, typ.UInt16)
+                               v2.AuxInt = 8
+                               v3 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
+                               v3.AuxInt = i0
+                               v3.Aux = s
+                               v3.AddArg2(p, mem)
+                               v2.AddArg(v3)
+                               v1.AddArg(v2)
+                               v0.AddArg2(v1, y)
+                               return true
+                       }
+               }
+               break
+       }
+       // match: (ORQ s0:(SHLQconst [j0] x0:(MOVBload [i] {s} p0 mem)) or:(ORQ s1:(SHLQconst [j1] x1:(MOVBload [i] {s} p1 mem)) y))
+       // cond: j1 == j0-8 && j1 % 16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (ROLWconst <typ.UInt16> [8] (MOVWload [i] {s} p0 mem))) y)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       s0 := v_0
+                       if s0.Op != OpAMD64SHLQconst {
+                               continue
+                       }
+                       j0 := s0.AuxInt
+                       x0 := s0.Args[0]
+                       if x0.Op != OpAMD64MOVBload {
+                               continue
+                       }
+                       i := x0.AuxInt
+                       s := x0.Aux
+                       mem := x0.Args[1]
+                       p0 := x0.Args[0]
+                       or := v_1
+                       if or.Op != OpAMD64ORQ {
+                               continue
+                       }
+                       _ = or.Args[1]
+                       or_0 := or.Args[0]
+                       or_1 := or.Args[1]
+                       for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+                               s1 := or_0
+                               if s1.Op != OpAMD64SHLQconst {
+                                       continue
+                               }
+                               j1 := s1.AuxInt
+                               x1 := s1.Args[0]
+                               if x1.Op != OpAMD64MOVBload || x1.AuxInt != i || x1.Aux != s {
+                                       continue
+                               }
+                               _ = x1.Args[1]
                                p1 := x1.Args[0]
                                if mem != x1.Args[1] {
                                        continue
                                }
                                y := or_1
-                               if !(i1 == i0+1 && j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
+                               if !(j1 == j0-8 && j1%16 == 0 && x0.Uses == 1 && x1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, s0, s1, or)) {
                                        continue
                                }
                                b = mergePoint(b, x0, x1, y)
@@ -19251,7 +19576,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                v2 := b.NewValue0(x1.Pos, OpAMD64ROLWconst, typ.UInt16)
                                v2.AuxInt = 8
                                v3 := b.NewValue0(x1.Pos, OpAMD64MOVWload, typ.UInt16)
-                               v3.AuxInt = i0
+                               v3.AuxInt = i
                                v3.Aux = s
                                v3.AddArg2(p0, mem)
                                v2.AddArg(v3)
@@ -19262,9 +19587,9 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                }
                break
        }
-       // match: (ORQ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p0 mem))) or:(ORQ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p1 mem))) y))
-       // cond: i1 == i0+2 && j1 == j0-16 && j1 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)
-       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p0 mem))) y)
+       // match: (ORQ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i0] {s} p mem))) or:(ORQ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i1] {s} p mem))) y))
+       // cond: i1 == i0+2 && j1 == j0-16 && j1 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i0] {s} p mem))) y)
        for {
                for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
                        s0 := v_0
@@ -19283,7 +19608,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                        i0 := x0.AuxInt
                        s := x0.Aux
                        mem := x0.Args[1]
-                       p0 := x0.Args[0]
+                       p := x0.Args[0]
                        or := v_1
                        if or.Op != OpAMD64ORQ {
                                continue
@@ -19310,12 +19635,81 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                        continue
                                }
                                _ = x1.Args[1]
+                               if p != x1.Args[0] || mem != x1.Args[1] {
+                                       continue
+                               }
+                               y := or_1
+                               if !(i1 == i0+2 && j1 == j0-16 && j1%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)) {
+                                       continue
+                               }
+                               b = mergePoint(b, x0, x1, y)
+                               v0 := b.NewValue0(x1.Pos, OpAMD64ORQ, v.Type)
+                               v.copyOf(v0)
+                               v1 := b.NewValue0(x1.Pos, OpAMD64SHLQconst, v.Type)
+                               v1.AuxInt = j1
+                               v2 := b.NewValue0(x1.Pos, OpAMD64BSWAPL, typ.UInt32)
+                               v3 := b.NewValue0(x1.Pos, OpAMD64MOVLload, typ.UInt32)
+                               v3.AuxInt = i0
+                               v3.Aux = s
+                               v3.AddArg2(p, mem)
+                               v2.AddArg(v3)
+                               v1.AddArg(v2)
+                               v0.AddArg2(v1, y)
+                               return true
+                       }
+               }
+               break
+       }
+       // match: (ORQ s0:(SHLQconst [j0] r0:(ROLWconst [8] x0:(MOVWload [i] {s} p0 mem))) or:(ORQ s1:(SHLQconst [j1] r1:(ROLWconst [8] x1:(MOVWload [i] {s} p1 mem))) y))
+       // cond: j1 == j0-16 && j1 % 32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b,x0,x1,y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)
+       // result: @mergePoint(b,x0,x1,y) (ORQ <v.Type> (SHLQconst <v.Type> [j1] (BSWAPL <typ.UInt32> (MOVLload [i] {s} p0 mem))) y)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       s0 := v_0
+                       if s0.Op != OpAMD64SHLQconst {
+                               continue
+                       }
+                       j0 := s0.AuxInt
+                       r0 := s0.Args[0]
+                       if r0.Op != OpAMD64ROLWconst || r0.AuxInt != 8 {
+                               continue
+                       }
+                       x0 := r0.Args[0]
+                       if x0.Op != OpAMD64MOVWload {
+                               continue
+                       }
+                       i := x0.AuxInt
+                       s := x0.Aux
+                       mem := x0.Args[1]
+                       p0 := x0.Args[0]
+                       or := v_1
+                       if or.Op != OpAMD64ORQ {
+                               continue
+                       }
+                       _ = or.Args[1]
+                       or_0 := or.Args[0]
+                       or_1 := or.Args[1]
+                       for _i1 := 0; _i1 <= 1; _i1, or_0, or_1 = _i1+1, or_1, or_0 {
+                               s1 := or_0
+                               if s1.Op != OpAMD64SHLQconst {
+                                       continue
+                               }
+                               j1 := s1.AuxInt
+                               r1 := s1.Args[0]
+                               if r1.Op != OpAMD64ROLWconst || r1.AuxInt != 8 {
+                                       continue
+                               }
+                               x1 := r1.Args[0]
+                               if x1.Op != OpAMD64MOVWload || x1.AuxInt != i || x1.Aux != s {
+                                       continue
+                               }
+                               _ = x1.Args[1]
                                p1 := x1.Args[0]
                                if mem != x1.Args[1] {
                                        continue
                                }
                                y := or_1
-                               if !(i1 == i0+2 && j1 == j0-16 && j1%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && same(p0, p1, 1) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)) {
+                               if !(j1 == j0-16 && j1%32 == 0 && x0.Uses == 1 && x1.Uses == 1 && r0.Uses == 1 && r1.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && or.Uses == 1 && sequentialAddresses(p0, p1, 2) && mergePoint(b, x0, x1, y) != nil && clobber(x0, x1, r0, r1, s0, s1, or)) {
                                        continue
                                }
                                b = mergePoint(b, x0, x1, y)
@@ -19325,7 +19719,7 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                                v1.AuxInt = j1
                                v2 := b.NewValue0(x1.Pos, OpAMD64BSWAPL, typ.UInt32)
                                v3 := b.NewValue0(x1.Pos, OpAMD64MOVLload, typ.UInt32)
-                               v3.AuxInt = i0
+                               v3.AuxInt = i
                                v3.Aux = s
                                v3.AddArg2(p0, mem)
                                v2.AddArg(v3)
index e2d703cb0cfb87a65e0a3edff099e577fc4390ea..6ad9514557e171c398e246396ec54c200eaa8baa 100644 (file)
@@ -160,14 +160,14 @@ func load_le_byte8_uint64_inv(s []byte) uint64 {
 
 func load_be_byte2_uint16(s []byte) uint16 {
        // arm64:`MOVHU\t\(R[0-9]+\)`,`REV16W`,-`ORR`,-`MOVB`
-       // amd64:`MOVWLZX\s\([A-Z]+\)`,-`MOVB`,-`OR`
+       // amd64:`MOVWLZX\s\([A-Z]+\)`,`ROLW`,-`MOVB`,-`OR`
        // ppc64le:`MOVHBR\t\(R[0-9]+\)`,-`MOVBZ`
        return uint16(s[0])<<8 | uint16(s[1])
 }
 
 func load_be_byte2_uint16_inv(s []byte) uint16 {
        // arm64:`MOVHU\t\(R[0-9]+\)`,`REV16W`,-`ORR`,-`MOVB`
-       // amd64:`MOVWLZX\s\([A-Z]+\)`,-`MOVB`,-`OR`
+       // amd64:`MOVWLZX\s\([A-Z]+\)`,`ROLW`,-`MOVB`,-`OR`
        // ppc64le:`MOVHBR\t\(R[0-9]+\)`,-`MOVBZ`
        return uint16(s[1]) | uint16(s[0])<<8
 }
@@ -179,7 +179,7 @@ func load_be_byte4_uint32(s []byte) uint32 {
 
 func load_be_byte4_uint32_inv(s []byte) uint32 {
        // arm64:`MOVWU\t\(R[0-9]+\)`,`REVW`,-`ORR`,-`REV16W`,-`MOV[BH]`
-       // amd64:`MOVL\s\([A-Z]+\)`,-`MOV[BW]`,-`OR`
+       // amd64:`MOVL\s\([A-Z]+\)`,`BSWAPL`,-`MOV[BW]`,-`OR`
        return uint32(s[3]) | uint32(s[2])<<8 | uint32(s[1])<<16 | uint32(s[0])<<24
 }
 
@@ -191,7 +191,7 @@ func load_be_byte8_uint64(s []byte) uint64 {
 
 func load_be_byte8_uint64_inv(s []byte) uint64 {
        // arm64:`MOVD\t\(R[0-9]+\)`,`REV`,-`ORR`,-`REVW`,-`REV16W`,-`MOV[BHW]`
-       // amd64:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,-`MOV[BWL]\t[^$]`,-`OR`
+       // amd64:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
        // ppc64le:`MOVDBR\t\(R[0-9]+\)`,-`MOV[BHW]Z`
        return uint64(s[7]) | uint64(s[6])<<8 | uint64(s[5])<<16 | uint64(s[4])<<24 | uint64(s[3])<<32 | uint64(s[2])<<40 | uint64(s[1])<<48 | uint64(s[0])<<56
 }