Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile/internal/ssa: generate bswap on AMD64
author Ilya Tocar <ilya.tocar@intel.com>
Thu, 27 Oct 2016 13:58:45 +0000 (16:58 +0300)
committer Ilya Tocar <ilya.tocar@intel.com>
Thu, 3 Nov 2016 12:34:12 +0000 (12:34 +0000)
Generate bswap+load/store for reading/writing big endian data.
Helps encoding/binary.

name                    old time/op    new time/op    delta
ReadSlice1000Int32s-8     5.06µs ± 8%    4.58µs ± 8%   -9.50%        (p=0.000 n=10+10)
ReadStruct-8              1.07µs ± 0%    1.05µs ± 0%   -1.51%         (p=0.000 n=9+10)
ReadInts-8                 367ns ± 0%     363ns ± 0%   -1.15%          (p=0.000 n=8+9)
WriteInts-8                475ns ± 1%     469ns ± 0%   -1.45%        (p=0.000 n=10+10)
WriteSlice1000Int32s-8    5.03µs ± 3%    4.50µs ± 3%  -10.45%          (p=0.000 n=9+9)
PutUvarint32-8            17.2ns ± 0%    17.2ns ± 0%     ~     (all samples are equal)
PutUvarint64-8            46.7ns ± 0%    46.7ns ± 0%     ~           (p=0.509 n=10+10)

name                    old speed      new speed      delta
ReadSlice1000Int32s-8    791MB/s ± 8%   875MB/s ± 8%  +10.53%        (p=0.000 n=10+10)
ReadStruct-8            70.0MB/s ± 0%  71.1MB/s ± 0%   +1.54%         (p=0.000 n=9+10)
ReadInts-8              81.6MB/s ± 0%  82.6MB/s ± 0%   +1.21%          (p=0.000 n=9+9)
WriteInts-8             63.0MB/s ± 1%  63.9MB/s ± 0%   +1.45%        (p=0.000 n=10+10)
WriteSlice1000Int32s-8   796MB/s ± 4%   888MB/s ± 3%  +11.65%          (p=0.000 n=9+9)
PutUvarint32-8           233MB/s ± 0%   233MB/s ± 0%     ~           (p=0.089 n=10+10)
PutUvarint64-8           171MB/s ± 0%   171MB/s ± 0%     ~            (p=0.137 n=10+9)

Change-Id: Ia2dbdef92198eaa7e2af5443a8ed586d4b401ffb
Reviewed-on: https://go-review.googlesource.com/32222
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/gc/asm_test.go
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/rewriteAMD64.go

index 58cdb9da7d75d31e3bda6b6b9d366d161591e02c..2e5d7e748822dc31b7d099aeb9ccc0a56de301a1 100644 (file)
@@ -157,6 +157,38 @@ func f(b []byte, i int) uint32 {
 `,
                []string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
        },
+       {"amd64", "linux", `
+import "encoding/binary"
+func f(b []byte) uint64 {
+       return binary.BigEndian.Uint64(b)
+}
+`,
+               []string{"\tBSWAPQ\t"},
+       },
+       {"amd64", "linux", `
+import "encoding/binary"
+func f(b []byte, i int) uint64 {
+       return binary.BigEndian.Uint64(b[i:])
+}
+`,
+               []string{"\tBSWAPQ\t"},
+       },
+       {"amd64", "linux", `
+import "encoding/binary"
+func f(b []byte) uint32 {
+       return binary.BigEndian.Uint32(b)
+}
+`,
+               []string{"\tBSWAPL\t"},
+       },
+       {"amd64", "linux", `
+import "encoding/binary"
+func f(b []byte, i int) uint32 {
+       return binary.BigEndian.Uint32(b[i:])
+}
+`,
+               []string{"\tBSWAPL\t"},
+       },
        {"386", "linux", `
 import "encoding/binary"
 func f(b []byte) uint32 {
index 4c49d109246a18757372331b8763051a62503d2b..5b4649cb143cd9d0fc72e46e65924727f1953eca 100644 (file)
   && clobber(o5)
   -> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVQloadidx1 <v.Type> [i] {s} p idx mem)
 
+// Combine byte loads + shifts into larger (unaligned) loads + bswap
+(ORL o1:(ORL o0:(ORL
+                       x0:(MOVBload [i] {s} p mem)
+    s0:(SHLLconst [8]  x1:(MOVBload [i-1] {s} p mem)))
+    s1:(SHLLconst [16] x2:(MOVBload [i-2] {s} p mem)))
+    s2:(SHLLconst [24] x3:(MOVBload [i-3] {s} p mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && s2.Uses == 1
+  && o0.Uses == 1
+  && o1.Uses == 1
+  && mergePoint(b,x0,x1,x2,x3) != nil
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(s0)
+  && clobber(s1)
+  && clobber(s2)
+  && clobber(o0)
+  && clobber(o1)
+  -> @mergePoint(b,x0,x1,x2,x3) (BSWAPL <v.Type> (MOVLload [i-3] {s} p mem))
+
+(ORL o1:(ORL o0:(ORL
+                       x0:(MOVBloadidx1 [i] {s} p idx mem)
+    s0:(SHLLconst [8]  x1:(MOVBloadidx1 [i-1] {s} p idx mem)))
+    s1:(SHLLconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem)))
+    s2:(SHLLconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && s2.Uses == 1
+  && o0.Uses == 1
+  && o1.Uses == 1
+  && mergePoint(b,x0,x1,x2,x3) != nil
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(s0)
+  && clobber(s1)
+  && clobber(s2)
+  && clobber(o0)
+  && clobber(o1)
+  -> @mergePoint(b,x0,x1,x2,x3) (BSWAPL <v.Type> (MOVLloadidx1 <v.Type> [i-3] {s} p idx mem))
+
+(ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ
+                       x0:(MOVBload [i] {s} p mem)
+    s0:(SHLQconst [8]  x1:(MOVBload [i-1] {s} p mem)))
+    s1:(SHLQconst [16] x2:(MOVBload [i-2] {s} p mem)))
+    s2:(SHLQconst [24] x3:(MOVBload [i-3] {s} p mem)))
+    s3:(SHLQconst [32] x4:(MOVBload [i-4] {s} p mem)))
+    s4:(SHLQconst [40] x5:(MOVBload [i-5] {s} p mem)))
+    s5:(SHLQconst [48] x6:(MOVBload [i-6] {s} p mem)))
+    s6:(SHLQconst [56] x7:(MOVBload [i-7] {s} p mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && x4.Uses == 1
+  && x5.Uses == 1
+  && x6.Uses == 1
+  && x7.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && s2.Uses == 1
+  && s3.Uses == 1
+  && s4.Uses == 1
+  && s5.Uses == 1
+  && s6.Uses == 1
+  && o0.Uses == 1
+  && o1.Uses == 1
+  && o2.Uses == 1
+  && o3.Uses == 1
+  && o4.Uses == 1
+  && o5.Uses == 1
+  && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(x4)
+  && clobber(x5)
+  && clobber(x6)
+  && clobber(x7)
+  && clobber(s0)
+  && clobber(s1)
+  && clobber(s2)
+  && clobber(s3)
+  && clobber(s4)
+  && clobber(s5)
+  && clobber(s6)
+  && clobber(o0)
+  && clobber(o1)
+  && clobber(o2)
+  && clobber(o3)
+  && clobber(o4)
+  && clobber(o5)
+  -> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ <v.Type> (MOVQload [i-7] {s} p mem))
+
+(ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ
+                       x0:(MOVBloadidx1 [i] {s} p idx mem)
+    s0:(SHLQconst [8]  x1:(MOVBloadidx1 [i-1] {s} p idx mem)))
+    s1:(SHLQconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem)))
+    s2:(SHLQconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))
+    s3:(SHLQconst [32] x4:(MOVBloadidx1 [i-4] {s} p idx mem)))
+    s4:(SHLQconst [40] x5:(MOVBloadidx1 [i-5] {s} p idx mem)))
+    s5:(SHLQconst [48] x6:(MOVBloadidx1 [i-6] {s} p idx mem)))
+    s6:(SHLQconst [56] x7:(MOVBloadidx1 [i-7] {s} p idx mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && x4.Uses == 1
+  && x5.Uses == 1
+  && x6.Uses == 1
+  && x7.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && s2.Uses == 1
+  && s3.Uses == 1
+  && s4.Uses == 1
+  && s5.Uses == 1
+  && s6.Uses == 1
+  && o0.Uses == 1
+  && o1.Uses == 1
+  && o2.Uses == 1
+  && o3.Uses == 1
+  && o4.Uses == 1
+  && o5.Uses == 1
+  && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(x4)
+  && clobber(x5)
+  && clobber(x6)
+  && clobber(x7)
+  && clobber(s0)
+  && clobber(s1)
+  && clobber(s2)
+  && clobber(s3)
+  && clobber(s4)
+  && clobber(s5)
+  && clobber(s6)
+  && clobber(o0)
+  && clobber(o1)
+  && clobber(o2)
+  && clobber(o3)
+  && clobber(o4)
+  && clobber(o5)
+  -> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ <v.Type> (MOVQloadidx1 <v.Type> [i-7] {s} p idx mem))
+
+// Combine stores + shifts into bswap and larger (unaligned) stores
+(MOVBstore [i] {s} p w
+  x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w)
+  x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w)
+  x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  -> (MOVLstore [i-3] {s} p (BSWAPL <w.Type> w) mem)
+
+(MOVBstore [i] {s} p w
+  x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w)
+  x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w)
+  x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w)
+  x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w)
+  x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w)
+  x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w)
+  x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && x4.Uses == 1
+  && x5.Uses == 1
+  && x6.Uses == 1
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(x4)
+  && clobber(x5)
+  && clobber(x6)
+  -> (MOVQstore [i-7] {s} p (BSWAPQ <w.Type> w) mem)
+
 // Combine constant stores into larger (unaligned) stores.
 (MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
   && x.Uses == 1
index 5c685ef25fe4188906344069563fcc26bcecb21b..1257ec6e7c37c81256bb091ddd692acf775319d4 100644 (file)
@@ -3928,6 +3928,280 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value, config *Config) bool {
                v.AddArg(mem)
                return true
        }
+       // match: (MOVBstore [i] {s} p w   x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w)   x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w)   x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+       // cond: x0.Uses == 1   && x1.Uses == 1   && x2.Uses == 1   && clobber(x0)   && clobber(x1)   && clobber(x2)
+       // result: (MOVLstore [i-3] {s} p (BSWAPL <w.Type> w) mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v.Args[0]
+               w := v.Args[1]
+               x2 := v.Args[2]
+               if x2.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x2.AuxInt != i-1 {
+                       break
+               }
+               if x2.Aux != s {
+                       break
+               }
+               if p != x2.Args[0] {
+                       break
+               }
+               x2_1 := x2.Args[1]
+               if x2_1.Op != OpAMD64SHRLconst {
+                       break
+               }
+               if x2_1.AuxInt != 8 {
+                       break
+               }
+               if w != x2_1.Args[0] {
+                       break
+               }
+               x1 := x2.Args[2]
+               if x1.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x1.AuxInt != i-2 {
+                       break
+               }
+               if x1.Aux != s {
+                       break
+               }
+               if p != x1.Args[0] {
+                       break
+               }
+               x1_1 := x1.Args[1]
+               if x1_1.Op != OpAMD64SHRLconst {
+                       break
+               }
+               if x1_1.AuxInt != 16 {
+                       break
+               }
+               if w != x1_1.Args[0] {
+                       break
+               }
+               x0 := x1.Args[2]
+               if x0.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x0.AuxInt != i-3 {
+                       break
+               }
+               if x0.Aux != s {
+                       break
+               }
+               if p != x0.Args[0] {
+                       break
+               }
+               x0_1 := x0.Args[1]
+               if x0_1.Op != OpAMD64SHRLconst {
+                       break
+               }
+               if x0_1.AuxInt != 24 {
+                       break
+               }
+               if w != x0_1.Args[0] {
+                       break
+               }
+               mem := x0.Args[2]
+               if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && clobber(x0) && clobber(x1) && clobber(x2)) {
+                       break
+               }
+               v.reset(OpAMD64MOVLstore)
+               v.AuxInt = i - 3
+               v.Aux = s
+               v.AddArg(p)
+               v0 := b.NewValue0(v.Line, OpAMD64BSWAPL, w.Type)
+               v0.AddArg(w)
+               v.AddArg(v0)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (MOVBstore [i] {s} p w   x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w)   x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w)   x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w)   x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w)   x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w)   x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w)   x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+       // cond: x0.Uses == 1   && x1.Uses == 1   && x2.Uses == 1   && x3.Uses == 1   && x4.Uses == 1   && x5.Uses == 1   && x6.Uses == 1   && clobber(x0)   && clobber(x1)   && clobber(x2)   && clobber(x3)   && clobber(x4)   && clobber(x5)   && clobber(x6)
+       // result: (MOVQstore [i-7] {s} p (BSWAPQ <w.Type> w) mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               p := v.Args[0]
+               w := v.Args[1]
+               x6 := v.Args[2]
+               if x6.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x6.AuxInt != i-1 {
+                       break
+               }
+               if x6.Aux != s {
+                       break
+               }
+               if p != x6.Args[0] {
+                       break
+               }
+               x6_1 := x6.Args[1]
+               if x6_1.Op != OpAMD64SHRQconst {
+                       break
+               }
+               if x6_1.AuxInt != 8 {
+                       break
+               }
+               if w != x6_1.Args[0] {
+                       break
+               }
+               x5 := x6.Args[2]
+               if x5.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x5.AuxInt != i-2 {
+                       break
+               }
+               if x5.Aux != s {
+                       break
+               }
+               if p != x5.Args[0] {
+                       break
+               }
+               x5_1 := x5.Args[1]
+               if x5_1.Op != OpAMD64SHRQconst {
+                       break
+               }
+               if x5_1.AuxInt != 16 {
+                       break
+               }
+               if w != x5_1.Args[0] {
+                       break
+               }
+               x4 := x5.Args[2]
+               if x4.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x4.AuxInt != i-3 {
+                       break
+               }
+               if x4.Aux != s {
+                       break
+               }
+               if p != x4.Args[0] {
+                       break
+               }
+               x4_1 := x4.Args[1]
+               if x4_1.Op != OpAMD64SHRQconst {
+                       break
+               }
+               if x4_1.AuxInt != 24 {
+                       break
+               }
+               if w != x4_1.Args[0] {
+                       break
+               }
+               x3 := x4.Args[2]
+               if x3.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x3.AuxInt != i-4 {
+                       break
+               }
+               if x3.Aux != s {
+                       break
+               }
+               if p != x3.Args[0] {
+                       break
+               }
+               x3_1 := x3.Args[1]
+               if x3_1.Op != OpAMD64SHRQconst {
+                       break
+               }
+               if x3_1.AuxInt != 32 {
+                       break
+               }
+               if w != x3_1.Args[0] {
+                       break
+               }
+               x2 := x3.Args[2]
+               if x2.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x2.AuxInt != i-5 {
+                       break
+               }
+               if x2.Aux != s {
+                       break
+               }
+               if p != x2.Args[0] {
+                       break
+               }
+               x2_1 := x2.Args[1]
+               if x2_1.Op != OpAMD64SHRQconst {
+                       break
+               }
+               if x2_1.AuxInt != 40 {
+                       break
+               }
+               if w != x2_1.Args[0] {
+                       break
+               }
+               x1 := x2.Args[2]
+               if x1.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x1.AuxInt != i-6 {
+                       break
+               }
+               if x1.Aux != s {
+                       break
+               }
+               if p != x1.Args[0] {
+                       break
+               }
+               x1_1 := x1.Args[1]
+               if x1_1.Op != OpAMD64SHRQconst {
+                       break
+               }
+               if x1_1.AuxInt != 48 {
+                       break
+               }
+               if w != x1_1.Args[0] {
+                       break
+               }
+               x0 := x1.Args[2]
+               if x0.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if x0.AuxInt != i-7 {
+                       break
+               }
+               if x0.Aux != s {
+                       break
+               }
+               if p != x0.Args[0] {
+                       break
+               }
+               x0_1 := x0.Args[1]
+               if x0_1.Op != OpAMD64SHRQconst {
+                       break
+               }
+               if x0_1.AuxInt != 56 {
+                       break
+               }
+               if w != x0_1.Args[0] {
+                       break
+               }
+               mem := x0.Args[2]
+               if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6)) {
+                       break
+               }
+               v.reset(OpAMD64MOVQstore)
+               v.AuxInt = i - 7
+               v.Aux = s
+               v.AddArg(p)
+               v0 := b.NewValue0(v.Line, OpAMD64BSWAPQ, w.Type)
+               v0.AddArg(w)
+               v.AddArg(v0)
+               v.AddArg(mem)
+               return true
+       }
        // match: (MOVBstore [i] {s} p (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
        // cond: x.Uses == 1   && clobber(x)
        // result: (MOVWstore [i-1] {s} p w mem)
@@ -10881,79 +11155,298 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value, config *Config) bool {
                v0.AddArg(mem)
                return true
        }
-       return false
-}
-func rewriteValueAMD64_OpAMD64ORLconst(v *Value, config *Config) bool {
-       b := v.Block
-       _ = b
-       // match: (ORLconst [c] x)
-       // cond: int32(c)==0
-       // result: x
+       // match: (ORL o1:(ORL o0:(ORL                        x0:(MOVBload [i] {s} p mem)     s0:(SHLLconst [8]  x1:(MOVBload [i-1] {s} p mem)))     s1:(SHLLconst [16] x2:(MOVBload [i-2] {s} p mem)))     s2:(SHLLconst [24] x3:(MOVBload [i-3] {s} p mem)))
+       // cond: x0.Uses == 1   && x1.Uses == 1   && x2.Uses == 1   && x3.Uses == 1   && s0.Uses == 1   && s1.Uses == 1   && s2.Uses == 1   && o0.Uses == 1   && o1.Uses == 1   && mergePoint(b,x0,x1,x2,x3) != nil   && clobber(x0)   && clobber(x1)   && clobber(x2)   && clobber(x3)   && clobber(s0)   && clobber(s1)   && clobber(s2)   && clobber(o0)   && clobber(o1)
+       // result: @mergePoint(b,x0,x1,x2,x3) (BSWAPL <v.Type> (MOVLload [i-3] {s} p mem))
        for {
-               c := v.AuxInt
-               x := v.Args[0]
-               if !(int32(c) == 0) {
+               o1 := v.Args[0]
+               if o1.Op != OpAMD64ORL {
                        break
                }
-               v.reset(OpCopy)
-               v.Type = x.Type
-               v.AddArg(x)
-               return true
-       }
-       // match: (ORLconst [c] _)
-       // cond: int32(c)==-1
-       // result: (MOVLconst [-1])
-       for {
-               c := v.AuxInt
-               if !(int32(c) == -1) {
+               o0 := o1.Args[0]
+               if o0.Op != OpAMD64ORL {
                        break
                }
-               v.reset(OpAMD64MOVLconst)
-               v.AuxInt = -1
-               return true
-       }
-       // match: (ORLconst [c] (MOVLconst [d]))
-       // cond:
-       // result: (MOVLconst [c|d])
-       for {
-               c := v.AuxInt
-               v_0 := v.Args[0]
-               if v_0.Op != OpAMD64MOVLconst {
+               x0 := o0.Args[0]
+               if x0.Op != OpAMD64MOVBload {
                        break
                }
-               d := v_0.AuxInt
-               v.reset(OpAMD64MOVLconst)
-               v.AuxInt = c | d
-               return true
-       }
-       return false
-}
-func rewriteValueAMD64_OpAMD64ORQ(v *Value, config *Config) bool {
-       b := v.Block
-       _ = b
-       // match: (ORQ x (MOVQconst [c]))
-       // cond: is32Bit(c)
-       // result: (ORQconst [c] x)
-       for {
-               x := v.Args[0]
-               v_1 := v.Args[1]
-               if v_1.Op != OpAMD64MOVQconst {
+               i := x0.AuxInt
+               s := x0.Aux
+               p := x0.Args[0]
+               mem := x0.Args[1]
+               s0 := o0.Args[1]
+               if s0.Op != OpAMD64SHLLconst {
                        break
                }
-               c := v_1.AuxInt
-               if !(is32Bit(c)) {
+               if s0.AuxInt != 8 {
                        break
                }
-               v.reset(OpAMD64ORQconst)
-               v.AuxInt = c
-               v.AddArg(x)
-               return true
-       }
-       // match: (ORQ (MOVQconst [c]) x)
-       // cond: is32Bit(c)
-       // result: (ORQconst [c] x)
-       for {
-               v_0 := v.Args[0]
+               x1 := s0.Args[0]
+               if x1.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x1.AuxInt != i-1 {
+                       break
+               }
+               if x1.Aux != s {
+                       break
+               }
+               if p != x1.Args[0] {
+                       break
+               }
+               if mem != x1.Args[1] {
+                       break
+               }
+               s1 := o1.Args[1]
+               if s1.Op != OpAMD64SHLLconst {
+                       break
+               }
+               if s1.AuxInt != 16 {
+                       break
+               }
+               x2 := s1.Args[0]
+               if x2.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x2.AuxInt != i-2 {
+                       break
+               }
+               if x2.Aux != s {
+                       break
+               }
+               if p != x2.Args[0] {
+                       break
+               }
+               if mem != x2.Args[1] {
+                       break
+               }
+               s2 := v.Args[1]
+               if s2.Op != OpAMD64SHLLconst {
+                       break
+               }
+               if s2.AuxInt != 24 {
+                       break
+               }
+               x3 := s2.Args[0]
+               if x3.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x3.AuxInt != i-3 {
+                       break
+               }
+               if x3.Aux != s {
+                       break
+               }
+               if p != x3.Args[0] {
+                       break
+               }
+               if mem != x3.Args[1] {
+                       break
+               }
+               if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && mergePoint(b, x0, x1, x2, x3) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(o0) && clobber(o1)) {
+                       break
+               }
+               b = mergePoint(b, x0, x1, x2, x3)
+               v0 := b.NewValue0(v.Line, OpAMD64BSWAPL, v.Type)
+               v.reset(OpCopy)
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Line, OpAMD64MOVLload, config.fe.TypeUInt32())
+               v1.AuxInt = i - 3
+               v1.Aux = s
+               v1.AddArg(p)
+               v1.AddArg(mem)
+               v0.AddArg(v1)
+               return true
+       }
+       // match: (ORL o1:(ORL o0:(ORL                        x0:(MOVBloadidx1 [i] {s} p idx mem)     s0:(SHLLconst [8]  x1:(MOVBloadidx1 [i-1] {s} p idx mem)))     s1:(SHLLconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem)))     s2:(SHLLconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))
+       // cond: x0.Uses == 1   && x1.Uses == 1   && x2.Uses == 1   && x3.Uses == 1   && s0.Uses == 1   && s1.Uses == 1   && s2.Uses == 1   && o0.Uses == 1   && o1.Uses == 1   && mergePoint(b,x0,x1,x2,x3) != nil   && clobber(x0)   && clobber(x1)   && clobber(x2)   && clobber(x3)   && clobber(s0)   && clobber(s1)   && clobber(s2)   && clobber(o0)   && clobber(o1)
+       // result: @mergePoint(b,x0,x1,x2,x3) (BSWAPL <v.Type> (MOVLloadidx1 <v.Type> [i-3] {s} p idx mem))
+       for {
+               o1 := v.Args[0]
+               if o1.Op != OpAMD64ORL {
+                       break
+               }
+               o0 := o1.Args[0]
+               if o0.Op != OpAMD64ORL {
+                       break
+               }
+               x0 := o0.Args[0]
+               if x0.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               i := x0.AuxInt
+               s := x0.Aux
+               p := x0.Args[0]
+               idx := x0.Args[1]
+               mem := x0.Args[2]
+               s0 := o0.Args[1]
+               if s0.Op != OpAMD64SHLLconst {
+                       break
+               }
+               if s0.AuxInt != 8 {
+                       break
+               }
+               x1 := s0.Args[0]
+               if x1.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x1.AuxInt != i-1 {
+                       break
+               }
+               if x1.Aux != s {
+                       break
+               }
+               if p != x1.Args[0] {
+                       break
+               }
+               if idx != x1.Args[1] {
+                       break
+               }
+               if mem != x1.Args[2] {
+                       break
+               }
+               s1 := o1.Args[1]
+               if s1.Op != OpAMD64SHLLconst {
+                       break
+               }
+               if s1.AuxInt != 16 {
+                       break
+               }
+               x2 := s1.Args[0]
+               if x2.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x2.AuxInt != i-2 {
+                       break
+               }
+               if x2.Aux != s {
+                       break
+               }
+               if p != x2.Args[0] {
+                       break
+               }
+               if idx != x2.Args[1] {
+                       break
+               }
+               if mem != x2.Args[2] {
+                       break
+               }
+               s2 := v.Args[1]
+               if s2.Op != OpAMD64SHLLconst {
+                       break
+               }
+               if s2.AuxInt != 24 {
+                       break
+               }
+               x3 := s2.Args[0]
+               if x3.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x3.AuxInt != i-3 {
+                       break
+               }
+               if x3.Aux != s {
+                       break
+               }
+               if p != x3.Args[0] {
+                       break
+               }
+               if idx != x3.Args[1] {
+                       break
+               }
+               if mem != x3.Args[2] {
+                       break
+               }
+               if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && mergePoint(b, x0, x1, x2, x3) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(o0) && clobber(o1)) {
+                       break
+               }
+               b = mergePoint(b, x0, x1, x2, x3)
+               v0 := b.NewValue0(v.Line, OpAMD64BSWAPL, v.Type)
+               v.reset(OpCopy)
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Line, OpAMD64MOVLloadidx1, v.Type)
+               v1.AuxInt = i - 3
+               v1.Aux = s
+               v1.AddArg(p)
+               v1.AddArg(idx)
+               v1.AddArg(mem)
+               v0.AddArg(v1)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64ORLconst(v *Value, config *Config) bool {
+       b := v.Block
+       _ = b
+       // match: (ORLconst [c] x)
+       // cond: int32(c)==0
+       // result: x
+       for {
+               c := v.AuxInt
+               x := v.Args[0]
+               if !(int32(c) == 0) {
+                       break
+               }
+               v.reset(OpCopy)
+               v.Type = x.Type
+               v.AddArg(x)
+               return true
+       }
+       // match: (ORLconst [c] _)
+       // cond: int32(c)==-1
+       // result: (MOVLconst [-1])
+       for {
+               c := v.AuxInt
+               if !(int32(c) == -1) {
+                       break
+               }
+               v.reset(OpAMD64MOVLconst)
+               v.AuxInt = -1
+               return true
+       }
+       // match: (ORLconst [c] (MOVLconst [d]))
+       // cond:
+       // result: (MOVLconst [c|d])
+       for {
+               c := v.AuxInt
+               v_0 := v.Args[0]
+               if v_0.Op != OpAMD64MOVLconst {
+                       break
+               }
+               d := v_0.AuxInt
+               v.reset(OpAMD64MOVLconst)
+               v.AuxInt = c | d
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64ORQ(v *Value, config *Config) bool {
+       b := v.Block
+       _ = b
+       // match: (ORQ x (MOVQconst [c]))
+       // cond: is32Bit(c)
+       // result: (ORQconst [c] x)
+       for {
+               x := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := v_1.AuxInt
+               if !(is32Bit(c)) {
+                       break
+               }
+               v.reset(OpAMD64ORQconst)
+               v.AuxInt = c
+               v.AddArg(x)
+               return true
+       }
+       // match: (ORQ (MOVQconst [c]) x)
+       // cond: is32Bit(c)
+       // result: (ORQconst [c] x)
+       for {
+               v_0 := v.Args[0]
                if v_0.Op != OpAMD64MOVQconst {
                        break
                }
@@ -11423,6 +11916,453 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value, config *Config) bool {
                v0.AddArg(mem)
                return true
        }
+       // match: (ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ                        x0:(MOVBload [i] {s} p mem)     s0:(SHLQconst [8]  x1:(MOVBload [i-1] {s} p mem)))     s1:(SHLQconst [16] x2:(MOVBload [i-2] {s} p mem)))     s2:(SHLQconst [24] x3:(MOVBload [i-3] {s} p mem)))     s3:(SHLQconst [32] x4:(MOVBload [i-4] {s} p mem)))     s4:(SHLQconst [40] x5:(MOVBload [i-5] {s} p mem)))     s5:(SHLQconst [48] x6:(MOVBload [i-6] {s} p mem)))     s6:(SHLQconst [56] x7:(MOVBload [i-7] {s} p mem)))
+       // cond: x0.Uses == 1   && x1.Uses == 1   && x2.Uses == 1   && x3.Uses == 1   && x4.Uses == 1   && x5.Uses == 1   && x6.Uses == 1   && x7.Uses == 1   && s0.Uses == 1   && s1.Uses == 1   && s2.Uses == 1   && s3.Uses == 1   && s4.Uses == 1   && s5.Uses == 1   && s6.Uses == 1   && o0.Uses == 1   && o1.Uses == 1   && o2.Uses == 1   && o3.Uses == 1   && o4.Uses == 1   && o5.Uses == 1   && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil   && clobber(x0)   && clobber(x1)   && clobber(x2)   && clobber(x3)   && clobber(x4)   && clobber(x5)   && clobber(x6)   && clobber(x7)   && clobber(s0)   && clobber(s1)   && clobber(s2)   && clobber(s3)   && clobber(s4)   && clobber(s5)   && clobber(s6)   && clobber(o0)   && clobber(o1)   && clobber(o2)   && clobber(o3)   && clobber(o4)   && clobber(o5)
+       // result: @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ <v.Type> (MOVQload [i-7] {s} p mem))
+       for {
+               o5 := v.Args[0]
+               if o5.Op != OpAMD64ORQ {
+                       break
+               }
+               o4 := o5.Args[0]
+               if o4.Op != OpAMD64ORQ {
+                       break
+               }
+               o3 := o4.Args[0]
+               if o3.Op != OpAMD64ORQ {
+                       break
+               }
+               o2 := o3.Args[0]
+               if o2.Op != OpAMD64ORQ {
+                       break
+               }
+               o1 := o2.Args[0]
+               if o1.Op != OpAMD64ORQ {
+                       break
+               }
+               o0 := o1.Args[0]
+               if o0.Op != OpAMD64ORQ {
+                       break
+               }
+               x0 := o0.Args[0]
+               if x0.Op != OpAMD64MOVBload {
+                       break
+               }
+               i := x0.AuxInt
+               s := x0.Aux
+               p := x0.Args[0]
+               mem := x0.Args[1]
+               s0 := o0.Args[1]
+               if s0.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s0.AuxInt != 8 {
+                       break
+               }
+               x1 := s0.Args[0]
+               if x1.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x1.AuxInt != i-1 {
+                       break
+               }
+               if x1.Aux != s {
+                       break
+               }
+               if p != x1.Args[0] {
+                       break
+               }
+               if mem != x1.Args[1] {
+                       break
+               }
+               s1 := o1.Args[1]
+               if s1.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s1.AuxInt != 16 {
+                       break
+               }
+               x2 := s1.Args[0]
+               if x2.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x2.AuxInt != i-2 {
+                       break
+               }
+               if x2.Aux != s {
+                       break
+               }
+               if p != x2.Args[0] {
+                       break
+               }
+               if mem != x2.Args[1] {
+                       break
+               }
+               s2 := o2.Args[1]
+               if s2.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s2.AuxInt != 24 {
+                       break
+               }
+               x3 := s2.Args[0]
+               if x3.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x3.AuxInt != i-3 {
+                       break
+               }
+               if x3.Aux != s {
+                       break
+               }
+               if p != x3.Args[0] {
+                       break
+               }
+               if mem != x3.Args[1] {
+                       break
+               }
+               s3 := o3.Args[1]
+               if s3.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s3.AuxInt != 32 {
+                       break
+               }
+               x4 := s3.Args[0]
+               if x4.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x4.AuxInt != i-4 {
+                       break
+               }
+               if x4.Aux != s {
+                       break
+               }
+               if p != x4.Args[0] {
+                       break
+               }
+               if mem != x4.Args[1] {
+                       break
+               }
+               s4 := o4.Args[1]
+               if s4.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s4.AuxInt != 40 {
+                       break
+               }
+               x5 := s4.Args[0]
+               if x5.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x5.AuxInt != i-5 {
+                       break
+               }
+               if x5.Aux != s {
+                       break
+               }
+               if p != x5.Args[0] {
+                       break
+               }
+               if mem != x5.Args[1] {
+                       break
+               }
+               s5 := o5.Args[1]
+               if s5.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s5.AuxInt != 48 {
+                       break
+               }
+               x6 := s5.Args[0]
+               if x6.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x6.AuxInt != i-6 {
+                       break
+               }
+               if x6.Aux != s {
+                       break
+               }
+               if p != x6.Args[0] {
+                       break
+               }
+               if mem != x6.Args[1] {
+                       break
+               }
+               s6 := v.Args[1]
+               if s6.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s6.AuxInt != 56 {
+                       break
+               }
+               x7 := s6.Args[0]
+               if x7.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x7.AuxInt != i-7 {
+                       break
+               }
+               if x7.Aux != s {
+                       break
+               }
+               if p != x7.Args[0] {
+                       break
+               }
+               if mem != x7.Args[1] {
+                       break
+               }
+               if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 && mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(s3) && clobber(s4) && clobber(s5) && clobber(s6) && clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5)) {
+                       break
+               }
+               b = mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7)
+               v0 := b.NewValue0(v.Line, OpAMD64BSWAPQ, v.Type)
+               v.reset(OpCopy)
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Line, OpAMD64MOVQload, config.fe.TypeUInt64())
+               v1.AuxInt = i - 7
+               v1.Aux = s
+               v1.AddArg(p)
+               v1.AddArg(mem)
+               v0.AddArg(v1)
+               return true
+       }
+       // match: (ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ                        x0:(MOVBloadidx1 [i] {s} p idx mem)     s0:(SHLQconst [8]  x1:(MOVBloadidx1 [i-1] {s} p idx mem)))     s1:(SHLQconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem)))     s2:(SHLQconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))     s3:(SHLQconst [32] x4:(MOVBloadidx1 [i-4] {s} p idx mem)))     s4:(SHLQconst [40] x5:(MOVBloadidx1 [i-5] {s} p idx mem)))     s5:(SHLQconst [48] x6:(MOVBloadidx1 [i-6] {s} p idx mem)))     s6:(SHLQconst [56] x7:(MOVBloadidx1 [i-7] {s} p idx mem)))
+       // cond: x0.Uses == 1   && x1.Uses == 1   && x2.Uses == 1   && x3.Uses == 1   && x4.Uses == 1   && x5.Uses == 1   && x6.Uses == 1   && x7.Uses == 1   && s0.Uses == 1   && s1.Uses == 1   && s2.Uses == 1   && s3.Uses == 1   && s4.Uses == 1   && s5.Uses == 1   && s6.Uses == 1   && o0.Uses == 1   && o1.Uses == 1   && o2.Uses == 1   && o3.Uses == 1   && o4.Uses == 1   && o5.Uses == 1   && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil   && clobber(x0)   && clobber(x1)   && clobber(x2)   && clobber(x3)   && clobber(x4)   && clobber(x5)   && clobber(x6)   && clobber(x7)   && clobber(s0)   && clobber(s1)   && clobber(s2)   && clobber(s3)   && clobber(s4)   && clobber(s5)   && clobber(s6)   && clobber(o0)   && clobber(o1)   && clobber(o2)   && clobber(o3)   && clobber(o4)   && clobber(o5)
+       // result: @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ <v.Type> (MOVQloadidx1 <v.Type> [i-7] {s} p idx mem))
+       for {
+               o5 := v.Args[0]
+               if o5.Op != OpAMD64ORQ {
+                       break
+               }
+               o4 := o5.Args[0]
+               if o4.Op != OpAMD64ORQ {
+                       break
+               }
+               o3 := o4.Args[0]
+               if o3.Op != OpAMD64ORQ {
+                       break
+               }
+               o2 := o3.Args[0]
+               if o2.Op != OpAMD64ORQ {
+                       break
+               }
+               o1 := o2.Args[0]
+               if o1.Op != OpAMD64ORQ {
+                       break
+               }
+               o0 := o1.Args[0]
+               if o0.Op != OpAMD64ORQ {
+                       break
+               }
+               x0 := o0.Args[0]
+               if x0.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               i := x0.AuxInt
+               s := x0.Aux
+               p := x0.Args[0]
+               idx := x0.Args[1]
+               mem := x0.Args[2]
+               s0 := o0.Args[1]
+               if s0.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s0.AuxInt != 8 {
+                       break
+               }
+               x1 := s0.Args[0]
+               if x1.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x1.AuxInt != i-1 {
+                       break
+               }
+               if x1.Aux != s {
+                       break
+               }
+               if p != x1.Args[0] {
+                       break
+               }
+               if idx != x1.Args[1] {
+                       break
+               }
+               if mem != x1.Args[2] {
+                       break
+               }
+               s1 := o1.Args[1]
+               if s1.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s1.AuxInt != 16 {
+                       break
+               }
+               x2 := s1.Args[0]
+               if x2.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x2.AuxInt != i-2 {
+                       break
+               }
+               if x2.Aux != s {
+                       break
+               }
+               if p != x2.Args[0] {
+                       break
+               }
+               if idx != x2.Args[1] {
+                       break
+               }
+               if mem != x2.Args[2] {
+                       break
+               }
+               s2 := o2.Args[1]
+               if s2.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s2.AuxInt != 24 {
+                       break
+               }
+               x3 := s2.Args[0]
+               if x3.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x3.AuxInt != i-3 {
+                       break
+               }
+               if x3.Aux != s {
+                       break
+               }
+               if p != x3.Args[0] {
+                       break
+               }
+               if idx != x3.Args[1] {
+                       break
+               }
+               if mem != x3.Args[2] {
+                       break
+               }
+               s3 := o3.Args[1]
+               if s3.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s3.AuxInt != 32 {
+                       break
+               }
+               x4 := s3.Args[0]
+               if x4.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x4.AuxInt != i-4 {
+                       break
+               }
+               if x4.Aux != s {
+                       break
+               }
+               if p != x4.Args[0] {
+                       break
+               }
+               if idx != x4.Args[1] {
+                       break
+               }
+               if mem != x4.Args[2] {
+                       break
+               }
+               s4 := o4.Args[1]
+               if s4.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s4.AuxInt != 40 {
+                       break
+               }
+               x5 := s4.Args[0]
+               if x5.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x5.AuxInt != i-5 {
+                       break
+               }
+               if x5.Aux != s {
+                       break
+               }
+               if p != x5.Args[0] {
+                       break
+               }
+               if idx != x5.Args[1] {
+                       break
+               }
+               if mem != x5.Args[2] {
+                       break
+               }
+               s5 := o5.Args[1]
+               if s5.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s5.AuxInt != 48 {
+                       break
+               }
+               x6 := s5.Args[0]
+               if x6.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x6.AuxInt != i-6 {
+                       break
+               }
+               if x6.Aux != s {
+                       break
+               }
+               if p != x6.Args[0] {
+                       break
+               }
+               if idx != x6.Args[1] {
+                       break
+               }
+               if mem != x6.Args[2] {
+                       break
+               }
+               s6 := v.Args[1]
+               if s6.Op != OpAMD64SHLQconst {
+                       break
+               }
+               if s6.AuxInt != 56 {
+                       break
+               }
+               x7 := s6.Args[0]
+               if x7.Op != OpAMD64MOVBloadidx1 {
+                       break
+               }
+               if x7.AuxInt != i-7 {
+                       break
+               }
+               if x7.Aux != s {
+                       break
+               }
+               if p != x7.Args[0] {
+                       break
+               }
+               if idx != x7.Args[1] {
+                       break
+               }
+               if mem != x7.Args[2] {
+                       break
+               }
+               if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 && mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(s3) && clobber(s4) && clobber(s5) && clobber(s6) && clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5)) {
+                       break
+               }
+               b = mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7)
+               v0 := b.NewValue0(v.Line, OpAMD64BSWAPQ, v.Type)
+               v.reset(OpCopy)
+               v.AddArg(v0)
+               v1 := b.NewValue0(v.Line, OpAMD64MOVQloadidx1, v.Type)
+               v1.AuxInt = i - 7
+               v1.Aux = s
+               v1.AddArg(p)
+               v1.AddArg(idx)
+               v1.AddArg(mem)
+               v0.AddArg(v1)
+               return true
+       }
        return false
 }
 func rewriteValueAMD64_OpAMD64ORQconst(v *Value, config *Config) bool {