Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile/internal/ssa: combine consecutive loads and stores on amd64
author Ilya Tocar <ilya.tocar@intel.com>
Fri, 18 Aug 2017 19:03:33 +0000 (14:03 -0500)
committer Ilya Tocar <ilya.tocar@intel.com>
Fri, 25 Aug 2017 20:05:17 +0000 (20:05 +0000)
Sometimes (often for calls) we generate code like this:

MOVQ  (addr),AX
MOVQ  8(addr),BX
MOVQ  AX,(otheraddr)
MOVQ  BX,8(otheraddr)

Replace it with

MOVUPS (addr),X0
MOVUPS X0,(otheraddr)

For completeness, do the same for 8-, 16-, and 32-bit loads/stores too.
Shaves 1% from code sections of go tool.

/localdisk/itocar/golang/bin/go 10293917
go_old 10334877 [40960 bytes]

read-only data = 682 bytes (0.040769%)
global text (code) = 38961 bytes (1.036503%)
Total difference 39643 bytes (0.674628%)

Updates #6853

Change-Id: I1f0d2f60273a63a079b58927cd1c4e3429d2e7ae
Reviewed-on: https://go-review.googlesource.com/57130
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/gc/asm_test.go
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/rewriteAMD64.go

index 08df053e8d0f1b9d2cc77063177cd073c06c3d14..bfd1f8a784d898b73d87d8349779739488a8835b 100644 (file)
@@ -24,9 +24,10 @@ import (
 // architecture-specific, and they are grouped in arrays of tests, one
 // for each architecture.
 //
-// Each asmTest consists in a function to be compiled and an array of
-// regexps that will be matched to the generated assembly. For
-// example, the following amd64 test
+// Each asmTest consists of a function to compile, an array of
+// positiveRegexps that must match the generated assembly, and
+// an array of negativeRegexps that must not match the generated assembly.
+// For example, the following amd64 test
 //
 //   {
 //       `
@@ -35,10 +36,11 @@ import (
 //       }
 //       `,
 //       []string{"\tSHLQ\t[$]6,"},
+//       []string{"MULQ"},
 //   }
 //
 // verifies that the code the compiler generates for a multiplication
-// by 64 contains a 'SHLQ' instruction.
+// by 64 contains a 'SHLQ' instruction and does not contain a 'MULQ' instruction.
 //
 // Since all the tests for a given architecture are dumped in the same
 // file, the function names must be unique. As a workaround for this
@@ -52,6 +54,7 @@ import (
 //       }
 //       `,
 //       []string{"\tSHLQ\t[$]6,"},
+//       []string{"MULQ"},
 //   }
 //
 // Each '$'-function will be given a unique name of form f<N>_<arch>,
@@ -124,16 +127,22 @@ func funcAsm(t *testing.T, asm string, funcName string) string {
 type asmTest struct {
        // function to compile
        function string
-       // regexps that must match the generated assembly
-       regexps []string
+       // positiveRegexps must all match the generated assembly; negativeRegexps must not match it
+       positiveRegexps []string
+       negativeRegexps []string
 }
 
 func (at asmTest) verifyAsm(t *testing.T, fa string) {
-       for _, r := range at.regexps {
+       for _, r := range at.positiveRegexps {
                if b, err := regexp.MatchString(r, fa); !b || err != nil {
                        t.Errorf("expected:%s\ngo:%s\nasm:%s\n", r, at.function, fa)
                }
        }
+       for _, r := range at.negativeRegexps {
+               if b, err := regexp.MatchString(r, fa); b || err != nil {
+                       t.Errorf("not expected:%s\ngo:%s\nasm:%s\n", r, at.function, fa)
+               }
+       }
 }
 
 type asmTests struct {
@@ -214,7 +223,7 @@ var allAsmTests = []*asmTests{
        {
                arch:    "amd64",
                os:      "linux",
-               imports: []string{"encoding/binary", "math", "math/bits", "unsafe"},
+               imports: []string{"encoding/binary", "math", "math/bits", "unsafe", "runtime"},
                tests:   linuxAMD64Tests,
        },
        {
@@ -262,6 +271,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tSHLQ\t\\$6,"},
+               []string{},
        },
        {
                `
@@ -270,6 +280,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tSHLQ\t\\$5,", "\tLEAQ\t\\(.*\\)\\(.*\\*2\\),"},
+               []string{},
        },
        // Load-combining tests.
        {
@@ -279,6 +290,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVQ\t\\(.*\\),"},
+               []string{},
        },
        {
                `
@@ -287,6 +299,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVQ\t\\(.*\\)\\(.*\\*1\\),"},
+               []string{},
        },
        {
                `
@@ -295,6 +308,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVL\t\\(.*\\),"},
+               []string{},
        },
        {
                `
@@ -303,6 +317,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
+               []string{},
        },
        {
                `
@@ -311,6 +326,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPQ\t"},
+               []string{},
        },
        {
                `
@@ -319,6 +335,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPQ\t"},
+               []string{},
        },
        {
                `
@@ -327,6 +344,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPQ\t"},
+               []string{},
        },
        {
                `
@@ -335,6 +353,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPQ\t"},
+               []string{},
        },
        {
                `
@@ -343,6 +362,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPL\t"},
+               []string{},
        },
        {
                `
@@ -351,6 +371,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPL\t"},
+               []string{},
        },
        {
                `
@@ -359,6 +380,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPL\t"},
+               []string{},
        },
        {
                `
@@ -367,6 +389,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPL\t"},
+               []string{},
        },
        {
                `
@@ -375,6 +398,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLW\t\\$8,"},
+               []string{},
        },
        {
                `
@@ -383,6 +407,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLW\t\\$8,"},
+               []string{},
        },
        {
                `
@@ -391,6 +416,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLW\t\\$8,"},
+               []string{},
        },
        {
                `
@@ -399,6 +425,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLW\t\\$8,"},
+               []string{},
        },
        // Structure zeroing.  See issue #18370.
        {
@@ -411,6 +438,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)"},
+               []string{},
        },
        // SSA-able composite literal initialization. Issue 18872.
        {
@@ -424,6 +452,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVQ\t[$]1", "\tMOVQ\t[$]2", "\tMOVQ\t[$]3", "\tMOVQ\t[$]4"},
+               []string{},
        },
        // Also test struct containing pointers (this was special because of write barriers).
        {
@@ -436,6 +465,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)", "\tCALL\truntime\\.writebarrierptr\\(SB\\)"},
+               []string{},
        },
        // Rotate tests
        {
@@ -445,6 +475,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLQ\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -453,6 +484,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLQ\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -461,6 +493,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLQ\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -469,6 +502,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLL\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -477,6 +511,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLL\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -485,6 +520,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLL\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -493,6 +529,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLW\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -501,6 +538,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLW\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -509,6 +547,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLW\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -517,6 +556,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLB\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -525,6 +565,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLB\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -533,6 +574,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLB\t[$]7,"},
+               []string{},
        },
        // Rotate after inlining (see issue 18254).
        {
@@ -545,6 +587,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLL\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -553,6 +596,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVQ\t[$]5,"},
+               []string{},
        },
        // Direct use of constants in fast map access calls. Issue 19015.
        {
@@ -563,6 +607,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVQ\t[$]5,"},
+               []string{},
        },
        {
                `
@@ -571,6 +616,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\"abc\""},
+               []string{},
        },
        {
                `
@@ -580,6 +626,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\"abc\""},
+               []string{},
        },
        // Bit test ops on amd64, issue 18943.
        {
@@ -592,6 +639,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBTQ\t"},
+               []string{},
        },
        {
                `
@@ -600,6 +648,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBTQ\t"},
+               []string{},
        },
        {
                `
@@ -611,6 +660,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBTQ\t\\$60"},
+               []string{},
        },
        {
                `
@@ -619,6 +669,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBTQ\t\\$60"},
+               []string{},
        },
        // Intrinsic tests for math/bits
        {
@@ -628,6 +679,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSFQ\t", "\tMOVL\t\\$64,", "\tCMOVQEQ\t"},
+               []string{},
        },
        {
                `
@@ -636,6 +688,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSFQ\t", "\tORQ\t[^$]", "\tMOVQ\t\\$4294967296,"},
+               []string{},
        },
        {
                `
@@ -644,6 +697,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSFQ\t", "\tORQ\t\\$65536,"},
+               []string{},
        },
        {
                `
@@ -652,6 +706,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSFQ\t", "\tORQ\t\\$256,"},
+               []string{},
        },
        {
                `
@@ -660,6 +715,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPQ\t"},
+               []string{},
        },
        {
                `
@@ -668,6 +724,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSWAPL\t"},
+               []string{},
        },
        {
                `
@@ -676,6 +733,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tROLW\t\\$8,"},
+               []string{},
        },
        {
                `
@@ -684,6 +742,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        {
                `
@@ -692,6 +751,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        {
                `
@@ -700,6 +760,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        /* see ssa.go
        {
@@ -709,6 +770,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        */
        {
@@ -718,6 +780,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        {
                `
@@ -726,6 +789,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        {
                `
@@ -734,6 +798,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        {
                `
@@ -742,6 +807,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        /* see ssa.go
        {
@@ -751,6 +817,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        */
        {
@@ -760,6 +827,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tBSRQ\t"},
+               []string{},
        },
        {
                `
@@ -767,6 +835,7 @@ var linuxAMD64Tests = []*asmTest{
                        return bits.OnesCount64(x)
                }`,
                []string{"\tPOPCNTQ\t", "support_popcnt"},
+               []string{},
        },
        {
                `
@@ -774,6 +843,7 @@ var linuxAMD64Tests = []*asmTest{
                        return bits.OnesCount32(x)
                }`,
                []string{"\tPOPCNTL\t", "support_popcnt"},
+               []string{},
        },
        {
                `
@@ -781,6 +851,7 @@ var linuxAMD64Tests = []*asmTest{
                        return bits.OnesCount16(x)
                }`,
                []string{"\tPOPCNTL\t", "support_popcnt"},
+               []string{},
        },
        {
                `
@@ -788,6 +859,7 @@ var linuxAMD64Tests = []*asmTest{
                        return bits.OnesCount(x)
                }`,
                []string{"\tPOPCNTQ\t", "support_popcnt"},
+               []string{},
        },
        // multiplication merging tests
        {
@@ -796,6 +868,7 @@ var linuxAMD64Tests = []*asmTest{
                        return 15*n + 31*n
                }`,
                []string{"\tIMULQ\t[$]46"}, // 46*n
+               []string{},
        },
        {
                `
@@ -803,6 +876,7 @@ var linuxAMD64Tests = []*asmTest{
                        return 5*n + 7*(n+1) + 11*(n+2)
                }`,
                []string{"\tIMULQ\t[$]23", "\tADDQ\t[$]29"}, // 23*n + 29
+               []string{},
        },
        {
                `
@@ -810,6 +884,7 @@ var linuxAMD64Tests = []*asmTest{
                        return a*n + 19*n
                }`,
                []string{"\tADDQ\t[$]19", "\tIMULQ"}, // (a+19)*n
+               []string{},
        },
 
        // see issue 19595.
@@ -821,6 +896,7 @@ var linuxAMD64Tests = []*asmTest{
                        *q += x
                }`,
                []string{"\tADDQ\t\\("},
+               []string{},
        },
        {
                `
@@ -831,6 +907,7 @@ var linuxAMD64Tests = []*asmTest{
                        }
                }`,
                []string{"\tADDQ\t[A-Z]"},
+               []string{},
        },
        // Floating-point strength reduction
        {
@@ -839,6 +916,7 @@ var linuxAMD64Tests = []*asmTest{
                        return f * 2.0
                }`,
                []string{"\tADDSD\t"},
+               []string{},
        },
        {
                `
@@ -846,6 +924,7 @@ var linuxAMD64Tests = []*asmTest{
                        return f / 16.0
                }`,
                []string{"\tMULSD\t"},
+               []string{},
        },
        {
                `
@@ -853,6 +932,7 @@ var linuxAMD64Tests = []*asmTest{
                        return f / 0.125
                }`,
                []string{"\tMULSD\t"},
+               []string{},
        },
        {
                `
@@ -860,6 +940,7 @@ var linuxAMD64Tests = []*asmTest{
                        return f / 0.5
                }`,
                []string{"\tADDSD\t"},
+               []string{},
        },
        // Check that compare to constant string uses 2/4/8 byte compares
        {
@@ -868,6 +949,7 @@ var linuxAMD64Tests = []*asmTest{
                    return a == "xx"
                }`,
                []string{"\tCMPW\t[A-Z]"},
+               []string{},
        },
        {
                `
@@ -875,6 +957,7 @@ var linuxAMD64Tests = []*asmTest{
                    return a == "xxxx"
                }`,
                []string{"\tCMPL\t[A-Z]"},
+               []string{},
        },
        {
                `
@@ -882,6 +965,7 @@ var linuxAMD64Tests = []*asmTest{
                    return a == "xxxxxxxx"
                }`,
                []string{"\tCMPQ\t[A-Z]"},
+               []string{},
        },
        // Non-constant rotate
        {
@@ -890,6 +974,7 @@ var linuxAMD64Tests = []*asmTest{
                        return x << z | x >> (64-z)
                }`,
                []string{"\tROLQ\t"},
+               []string{},
        },
        {
                `func rot64r(x uint64, y int) uint64 {
@@ -897,6 +982,7 @@ var linuxAMD64Tests = []*asmTest{
                        return x >> z | x << (64-z)
                }`,
                []string{"\tRORQ\t"},
+               []string{},
        },
        {
                `func rot32l(x uint32, y int) uint32 {
@@ -904,6 +990,7 @@ var linuxAMD64Tests = []*asmTest{
                        return x << z | x >> (32-z)
                }`,
                []string{"\tROLL\t"},
+               []string{},
        },
        {
                `func rot32r(x uint32, y int) uint32 {
@@ -911,6 +998,7 @@ var linuxAMD64Tests = []*asmTest{
                        return x >> z | x << (32-z)
                }`,
                []string{"\tRORL\t"},
+               []string{},
        },
        {
                `func rot16l(x uint16, y int) uint16 {
@@ -918,6 +1006,7 @@ var linuxAMD64Tests = []*asmTest{
                        return x << z | x >> (16-z)
                }`,
                []string{"\tROLW\t"},
+               []string{},
        },
        {
                `func rot16r(x uint16, y int) uint16 {
@@ -925,6 +1014,7 @@ var linuxAMD64Tests = []*asmTest{
                        return x >> z | x << (16-z)
                }`,
                []string{"\tRORW\t"},
+               []string{},
        },
        {
                `func rot8l(x uint8, y int) uint8 {
@@ -932,6 +1022,7 @@ var linuxAMD64Tests = []*asmTest{
                        return x << z | x >> (8-z)
                }`,
                []string{"\tROLB\t"},
+               []string{},
        },
        {
                `func rot8r(x uint8, y int) uint8 {
@@ -939,6 +1030,7 @@ var linuxAMD64Tests = []*asmTest{
                        return x >> z | x << (8-z)
                }`,
                []string{"\tRORB\t"},
+               []string{},
        },
        // Check that array compare uses 2/4/8 byte compares
        {
@@ -947,6 +1039,7 @@ var linuxAMD64Tests = []*asmTest{
                    return a == b
                }`,
                []string{"\tCMPW\t[A-Z]"},
+               []string{},
        },
        {
                `
@@ -954,6 +1047,7 @@ var linuxAMD64Tests = []*asmTest{
                    return a == b
                }`,
                []string{"\tCMPL\t[A-Z]"},
+               []string{},
        },
        {
                `
@@ -961,6 +1055,7 @@ var linuxAMD64Tests = []*asmTest{
                    return a == b
                }`,
                []string{"\tCMPQ\t[A-Z]"},
+               []string{},
        },
        {
                `
@@ -968,6 +1063,7 @@ var linuxAMD64Tests = []*asmTest{
                    return *((*[4]byte)(a)) != *((*[4]byte)(b))
                }`,
                []string{"\tCMPL\t[A-Z]"},
+               []string{},
        },
        {
                // make sure assembly output has matching offset and base register.
@@ -979,6 +1075,56 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"b\\+40\\(SP\\)"},
+               []string{},
+       },
+       {
+               // check load combining
+               `
+               func f73(a, b byte) (byte,byte) {
+                   return f73(f73(a,b))
+               }
+               `,
+               []string{"\tMOVW\t"},
+               []string{},
+       },
+       {
+               `
+               func f74(a, b uint16) (uint16,uint16) {
+                   return f74(f74(a,b))
+               }
+               `,
+               []string{"\tMOVL\t"},
+               []string{},
+       },
+       {
+               `
+               func f75(a, b uint32) (uint32,uint32) {
+                   return f75(f75(a,b))
+               }
+               `,
+               []string{"\tMOVQ\t"},
+               []string{},
+       },
+       {
+               `
+               func f76(a, b uint64) (uint64,uint64) {
+                   return f76(f76(a,b))
+               }
+               `,
+               []string{"\tMOVUPS\t"},
+               []string{},
+       },
+       // Make sure we don't put pointers in SSE registers across safe points.
+       {
+               `
+               func $(p, q *[2]*int)  {
+                   a, b := p[0], p[1]
+                   runtime.GC()
+                   q[0], q[1] = a, b
+               }
+               `,
+               []string{},
+               []string{"MOVUPS"},
        },
        {
                // check that stack store is optimized away
@@ -989,6 +1135,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"TEXT\t.*, [$]0-8"},
+               []string{},
        },
        // math.Abs using integer registers
        {
@@ -998,6 +1145,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tSHLQ\t[$]1,", "\tSHRQ\t[$]1,"},
+               []string{},
        },
        // math.Copysign using integer registers
        {
@@ -1007,6 +1155,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tSHLQ\t[$]1,", "\tSHRQ\t[$]1,", "\tSHRQ\t[$]63,", "\tSHLQ\t[$]63,", "\tORQ\t"},
+               []string{},
        },
        // int <-> fp moves
        {
@@ -1016,6 +1165,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVQ\tX.*, [^X].*"},
+               []string{},
        },
        {
                `
@@ -1024,6 +1174,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVL\tX.*, [^X].*"},
+               []string{},
        },
        {
                `
@@ -1032,6 +1183,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVQ\t[^X].*, X.*"},
+               []string{},
        },
        {
                `
@@ -1040,6 +1192,7 @@ var linuxAMD64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVL\t[^X].*, X.*"},
+               []string{},
        },
 }
 
@@ -1051,6 +1204,7 @@ var linux386Tests = []*asmTest{
                }
                `,
                []string{"\tMOVL\t\\(.*\\),"},
+               []string{},
        },
        {
                `
@@ -1059,6 +1213,7 @@ var linux386Tests = []*asmTest{
                }
                `,
                []string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
+               []string{},
        },
 
        // multiplication merging tests
@@ -1068,6 +1223,7 @@ var linux386Tests = []*asmTest{
                        return 9*n + 14*n
                }`,
                []string{"\tIMULL\t[$]23"}, // 23*n
+               []string{},
        },
        {
                `
@@ -1075,6 +1231,7 @@ var linux386Tests = []*asmTest{
                        return 19*a + a*n
                }`,
                []string{"\tADDL\t[$]19", "\tIMULL"}, // (n+19)*a
+               []string{},
        },
        {
                // check that stack store is optimized away
@@ -1085,6 +1242,7 @@ var linux386Tests = []*asmTest{
                }
                `,
                []string{"TEXT\t.*, [$]0-4"},
+               []string{},
        },
 }
 
@@ -1096,6 +1254,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVWBR\t\\(.*\\),"},
+               []string{},
        },
        {
                `
@@ -1104,6 +1263,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVWBR\t\\(.*\\)\\(.*\\*1\\),"},
+               []string{},
        },
        {
                `
@@ -1112,6 +1272,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVDBR\t\\(.*\\),"},
+               []string{},
        },
        {
                `
@@ -1120,6 +1281,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVDBR\t\\(.*\\)\\(.*\\*1\\),"},
+               []string{},
        },
        {
                `
@@ -1128,6 +1290,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVWZ\t\\(.*\\),"},
+               []string{},
        },
        {
                `
@@ -1136,6 +1299,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVWZ\t\\(.*\\)\\(.*\\*1\\),"},
+               []string{},
        },
        {
                `
@@ -1144,6 +1308,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVD\t\\(.*\\),"},
+               []string{},
        },
        {
                `
@@ -1152,6 +1317,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVD\t\\(.*\\)\\(.*\\*1\\),"},
+               []string{},
        },
        {
                `
@@ -1160,6 +1326,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tRLLG\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -1168,6 +1335,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tRLLG\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -1176,6 +1344,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tRLLG\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -1184,6 +1353,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tRLL\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -1192,6 +1362,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tRLL\t[$]7,"},
+               []string{},
        },
        {
                `
@@ -1200,6 +1371,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tRLL\t[$]7,"},
+               []string{},
        },
        // Fused multiply-add/sub instructions.
        {
@@ -1209,6 +1381,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFMADD\t"},
+               []string{},
        },
        {
                `
@@ -1217,6 +1390,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFMSUB\t"},
+               []string{},
        },
        {
                `
@@ -1225,6 +1399,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFMADDS\t"},
+               []string{},
        },
        {
                `
@@ -1233,6 +1408,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFMSUBS\t"},
+               []string{},
        },
        // Intrinsic tests for math/bits
        {
@@ -1242,6 +1418,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1250,6 +1427,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t", "\tMOVWZ\t"},
+               []string{},
        },
        {
                `
@@ -1258,6 +1436,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t", "\tOR\t\\$65536,"},
+               []string{},
        },
        {
                `
@@ -1266,6 +1445,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t", "\tOR\t\\$256,"},
+               []string{},
        },
        // Intrinsic tests for math/bits
        {
@@ -1275,6 +1455,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVDBR\t"},
+               []string{},
        },
        {
                `
@@ -1283,6 +1464,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tMOVWBR\t"},
+               []string{},
        },
        {
                `
@@ -1291,6 +1473,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1299,6 +1482,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1307,6 +1491,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1315,6 +1500,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1323,6 +1509,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1331,6 +1518,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1339,6 +1527,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1347,6 +1536,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1355,6 +1545,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                `
@@ -1363,6 +1554,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"\tFLOGR\t"},
+               []string{},
        },
        {
                // check that stack store is optimized away
@@ -1373,6 +1565,7 @@ var linuxS390XTests = []*asmTest{
                }
                `,
                []string{"TEXT\t.*, [$]0-8"},
+               []string{},
        },
 }
 
@@ -1384,6 +1577,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tMOVW\tR[0-9]+@>25,"},
+               []string{},
        },
        {
                `
@@ -1392,6 +1586,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tMOVW\tR[0-9]+@>25,"},
+               []string{},
        },
        {
                `
@@ -1400,6 +1595,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tMOVW\tR[0-9]+@>25,"},
+               []string{},
        },
        {
                `
@@ -1408,6 +1604,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1416,6 +1613,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1424,6 +1622,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1432,6 +1631,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1440,6 +1640,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1448,6 +1649,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1456,6 +1658,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1464,6 +1667,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1472,6 +1676,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1480,6 +1685,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                // make sure assembly output has matching offset and base register.
@@ -1491,6 +1697,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"b\\+4\\(FP\\)"},
+               []string{},
        },
        {
                // check that stack store is optimized away
@@ -1501,6 +1708,7 @@ var linuxARMTests = []*asmTest{
                }
                `,
                []string{"TEXT\t.*, [$]-4-4"},
+               []string{},
        },
 }
 
@@ -1512,6 +1720,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tROR\t[$]57,"},
+               []string{},
        },
        {
                `
@@ -1520,6 +1729,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tROR\t[$]57,"},
+               []string{},
        },
        {
                `
@@ -1528,6 +1738,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tROR\t[$]57,"},
+               []string{},
        },
        {
                `
@@ -1536,6 +1747,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tRORW\t[$]25,"},
+               []string{},
        },
        {
                `
@@ -1544,6 +1756,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tRORW\t[$]25,"},
+               []string{},
        },
        {
                `
@@ -1552,6 +1765,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tRORW\t[$]25,"},
+               []string{},
        },
        {
                `
@@ -1560,6 +1774,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tREV\t"},
+               []string{},
        },
        {
                `
@@ -1568,6 +1783,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tREVW\t"},
+               []string{},
        },
        {
                `
@@ -1576,6 +1792,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1584,6 +1801,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1592,6 +1810,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1600,6 +1819,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1608,6 +1828,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1616,6 +1837,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1624,6 +1846,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1632,6 +1855,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1640,6 +1864,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1648,6 +1873,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1656,6 +1882,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tAND\t"},
+               []string{},
        },
        {
                `
@@ -1664,6 +1891,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tAND\t"},
+               []string{},
        },
        {
                // make sure offsets are folded into load and store.
@@ -1674,6 +1902,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"\tMOVD\t\"\"\\.a\\+[0-9]+\\(FP\\), R[0-9]+", "\tMOVD\tR[0-9]+, \"\"\\.b\\+[0-9]+\\(FP\\)"},
+               []string{},
        },
        {
                // check that stack store is optimized away
@@ -1684,6 +1913,7 @@ var linuxARM64Tests = []*asmTest{
                }
                `,
                []string{"TEXT\t.*, [$]-8-8"},
+               []string{},
        },
 }
 
@@ -1695,6 +1925,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1703,6 +1934,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1711,6 +1943,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1719,6 +1952,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1727,6 +1961,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1735,6 +1970,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1743,6 +1979,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1751,6 +1988,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1759,6 +1997,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                `
@@ -1767,6 +2006,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"\tCLZ\t"},
+               []string{},
        },
        {
                // check that stack store is optimized away
@@ -1777,6 +2017,7 @@ var linuxMIPSTests = []*asmTest{
                }
                `,
                []string{"TEXT\t.*, [$]-4-4"},
+               []string{},
        },
 }
 
@@ -1789,6 +2030,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tFMADD\t"},
+               []string{},
        },
        {
                `
@@ -1797,6 +2039,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tFMSUB\t"},
+               []string{},
        },
        {
                `
@@ -1805,6 +2048,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tFMADDS\t"},
+               []string{},
        },
        {
                `
@@ -1813,6 +2057,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tFMSUBS\t"},
+               []string{},
        },
        {
                `
@@ -1821,6 +2066,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tROTLW\t"},
+               []string{},
        },
        {
                `
@@ -1829,6 +2075,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tROTLW\t"},
+               []string{},
        },
        {
                `
@@ -1837,6 +2084,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tROTLW\t"},
+               []string{},
        },
        {
                `
@@ -1845,6 +2093,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tROTL\t"},
+               []string{},
        },
        {
                `
@@ -1853,6 +2102,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tROTL\t"},
+               []string{},
        },
        {
                `
@@ -1861,6 +2111,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"\tROTL\t"},
+               []string{},
        },
        {
                // check that stack store is optimized away
@@ -1871,6 +2122,7 @@ var linuxPPC64LETests = []*asmTest{
                }
                `,
                []string{"TEXT\t.*, [$]0-8"},
+               []string{},
        },
 }
 
index e7616a4ae639ceca2deae79dfb62eb6d872f0ed2..0e19e5970a50d40b4ef3d32a297bb61e938d8ef3 100644 (file)
   && clobber(x)
   -> (MOVQstoreidx1 [i-4] {s} p (SHLQconst <idx.Type> [2] idx) w0 mem)
 
+(MOVBstore [i] {s} p
+  x1:(MOVBload [j] {s2} p2 mem)
+    mem2:(MOVBstore [i-1] {s} p
+      x2:(MOVBload [j-1] {s2} p2 mem) mem))
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && mem2.Uses == 1
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(mem2)
+  -> (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem)
+
+(MOVWstore [i] {s} p
+  x1:(MOVWload [j] {s2} p2 mem)
+    mem2:(MOVWstore [i-2] {s} p
+      x2:(MOVWload [j-2] {s2} p2 mem) mem))
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && mem2.Uses == 1
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(mem2)
+  -> (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem)
+
+(MOVLstore [i] {s} p
+  x1:(MOVLload [j] {s2} p2 mem)
+    mem2:(MOVLstore [i-4] {s} p
+      x2:(MOVLload [j-4] {s2} p2 mem) mem))
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && mem2.Uses == 1
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(mem2)
+  -> (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem)
+
+// This is somewhat tricky. There may be pointers in SSE registers due to the rule below.
+// However, those registers shouldn't be live across a GC safepoint.
+(MOVQstore [i] {s} p
+  x1:(MOVQload [j] {s2} p2 mem)
+    mem2:(MOVQstore [i-8] {s} p
+      x2:(MOVQload [j-8] {s2} p2 mem) mem))
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && mem2.Uses == 1
+  && config.useSSE
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(mem2)
+  -> (MOVOstore [i-8] {s} p (MOVOload [j-8] {s2} p2 mem) mem)
+
+
 // amd64p32 rules
 // same as the rules above, but with 32 instead of 64 bit pointer arithmetic.
 // LEAQ,ADDQ -> LEAL,ADDL
index 163790c970a9f989233bfa4845e411a3e5e790fb..e2971696bb3e9a4ee7bde99a6271763b07e209ec 100644 (file)
@@ -154,7 +154,7 @@ func rewriteValueAMD64(v *Value) bool {
        case OpAMD64MOVQloadidx8:
                return rewriteValueAMD64_OpAMD64MOVQloadidx8_0(v)
        case OpAMD64MOVQstore:
-               return rewriteValueAMD64_OpAMD64MOVQstore_0(v)
+               return rewriteValueAMD64_OpAMD64MOVQstore_0(v) || rewriteValueAMD64_OpAMD64MOVQstore_10(v)
        case OpAMD64MOVQstoreconst:
                return rewriteValueAMD64_OpAMD64MOVQstoreconst_0(v)
        case OpAMD64MOVQstoreconstidx1:
@@ -5690,6 +5690,10 @@ func rewriteValueAMD64_OpAMD64MOVBstore_0(v *Value) bool {
        return false
 }
 func rewriteValueAMD64_OpAMD64MOVBstore_10(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
        // match: (MOVBstore [i] {s} p (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
        // cond: x.Uses == 1   && clobber(x)
        // result: (MOVWstore [i-1] {s} p w mem)
@@ -5785,6 +5789,73 @@ func rewriteValueAMD64_OpAMD64MOVBstore_10(v *Value) bool {
                v.AddArg(mem)
                return true
        }
+       // match: (MOVBstore [i] {s} p x1:(MOVBload [j] {s2} p2 mem) mem2:(MOVBstore [i-1] {s} p x2:(MOVBload [j-1] {s2} p2 mem) mem))
+       // cond: x1.Uses == 1   && x2.Uses == 1   && mem2.Uses == 1   && clobber(x1)   && clobber(x2)   && clobber(mem2)
+       // result: (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               _ = v.Args[2]
+               p := v.Args[0]
+               x1 := v.Args[1]
+               if x1.Op != OpAMD64MOVBload {
+                       break
+               }
+               j := x1.AuxInt
+               s2 := x1.Aux
+               _ = x1.Args[1]
+               p2 := x1.Args[0]
+               mem := x1.Args[1]
+               mem2 := v.Args[2]
+               if mem2.Op != OpAMD64MOVBstore {
+                       break
+               }
+               if mem2.AuxInt != i-1 {
+                       break
+               }
+               if mem2.Aux != s {
+                       break
+               }
+               _ = mem2.Args[2]
+               if p != mem2.Args[0] {
+                       break
+               }
+               x2 := mem2.Args[1]
+               if x2.Op != OpAMD64MOVBload {
+                       break
+               }
+               if x2.AuxInt != j-1 {
+                       break
+               }
+               if x2.Aux != s2 {
+                       break
+               }
+               _ = x2.Args[1]
+               if p2 != x2.Args[0] {
+                       break
+               }
+               if mem != x2.Args[1] {
+                       break
+               }
+               if mem != mem2.Args[2] {
+                       break
+               }
+               if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) {
+                       break
+               }
+               v.reset(OpAMD64MOVWstore)
+               v.AuxInt = i - 1
+               v.Aux = s
+               v.AddArg(p)
+               v0 := b.NewValue0(v.Pos, OpAMD64MOVWload, typ.UInt16)
+               v0.AuxInt = j - 1
+               v0.Aux = s2
+               v0.AddArg(p2)
+               v0.AddArg(mem)
+               v.AddArg(v0)
+               v.AddArg(mem)
+               return true
+       }
        // match: (MOVBstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
        // cond: canMergeSym(sym1, sym2)
        // result: (MOVBstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
@@ -7810,6 +7881,77 @@ func rewriteValueAMD64_OpAMD64MOVLstore_0(v *Value) bool {
        return false
 }
 func rewriteValueAMD64_OpAMD64MOVLstore_10(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (MOVLstore [i] {s} p x1:(MOVLload [j] {s2} p2 mem) mem2:(MOVLstore [i-4] {s} p x2:(MOVLload [j-4] {s2} p2 mem) mem))
+       // cond: x1.Uses == 1   && x2.Uses == 1   && mem2.Uses == 1   && clobber(x1)   && clobber(x2)   && clobber(mem2)
+       // result: (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               _ = v.Args[2]
+               p := v.Args[0]
+               x1 := v.Args[1]
+               if x1.Op != OpAMD64MOVLload {
+                       break
+               }
+               j := x1.AuxInt
+               s2 := x1.Aux
+               _ = x1.Args[1]
+               p2 := x1.Args[0]
+               mem := x1.Args[1]
+               mem2 := v.Args[2]
+               if mem2.Op != OpAMD64MOVLstore {
+                       break
+               }
+               if mem2.AuxInt != i-4 {
+                       break
+               }
+               if mem2.Aux != s {
+                       break
+               }
+               _ = mem2.Args[2]
+               if p != mem2.Args[0] {
+                       break
+               }
+               x2 := mem2.Args[1]
+               if x2.Op != OpAMD64MOVLload {
+                       break
+               }
+               if x2.AuxInt != j-4 {
+                       break
+               }
+               if x2.Aux != s2 {
+                       break
+               }
+               _ = x2.Args[1]
+               if p2 != x2.Args[0] {
+                       break
+               }
+               if mem != x2.Args[1] {
+                       break
+               }
+               if mem != mem2.Args[2] {
+                       break
+               }
+               if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) {
+                       break
+               }
+               v.reset(OpAMD64MOVQstore)
+               v.AuxInt = i - 4
+               v.Aux = s
+               v.AddArg(p)
+               v0 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64)
+               v0.AuxInt = j - 4
+               v0.Aux = s2
+               v0.AddArg(p2)
+               v0.AddArg(mem)
+               v.AddArg(v0)
+               v.AddArg(mem)
+               return true
+       }
        // match: (MOVLstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
        // cond: canMergeSym(sym1, sym2)
        // result: (MOVLstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
@@ -9345,6 +9487,10 @@ func rewriteValueAMD64_OpAMD64MOVQloadidx8_0(v *Value) bool {
        return false
 }
 func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
        // match: (MOVQstore [off1] {sym} (ADDQconst [off2] ptr) val mem)
        // cond: is32Bit(off1+off2)
        // result: (MOVQstore  [off1+off2] {sym} ptr val mem)
@@ -9510,6 +9656,73 @@ func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool {
                v.AddArg(mem)
                return true
        }
+       // match: (MOVQstore [i] {s} p x1:(MOVQload [j] {s2} p2 mem) mem2:(MOVQstore [i-8] {s} p x2:(MOVQload [j-8] {s2} p2 mem) mem))
+       // cond: x1.Uses == 1   && x2.Uses == 1   && mem2.Uses == 1   && config.useSSE   && clobber(x1)   && clobber(x2)   && clobber(mem2)
+       // result: (MOVOstore [i-8] {s} p (MOVOload [j-8] {s2} p2 mem) mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               _ = v.Args[2]
+               p := v.Args[0]
+               x1 := v.Args[1]
+               if x1.Op != OpAMD64MOVQload {
+                       break
+               }
+               j := x1.AuxInt
+               s2 := x1.Aux
+               _ = x1.Args[1]
+               p2 := x1.Args[0]
+               mem := x1.Args[1]
+               mem2 := v.Args[2]
+               if mem2.Op != OpAMD64MOVQstore {
+                       break
+               }
+               if mem2.AuxInt != i-8 {
+                       break
+               }
+               if mem2.Aux != s {
+                       break
+               }
+               _ = mem2.Args[2]
+               if p != mem2.Args[0] {
+                       break
+               }
+               x2 := mem2.Args[1]
+               if x2.Op != OpAMD64MOVQload {
+                       break
+               }
+               if x2.AuxInt != j-8 {
+                       break
+               }
+               if x2.Aux != s2 {
+                       break
+               }
+               _ = x2.Args[1]
+               if p2 != x2.Args[0] {
+                       break
+               }
+               if mem != x2.Args[1] {
+                       break
+               }
+               if mem != mem2.Args[2] {
+                       break
+               }
+               if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && config.useSSE && clobber(x1) && clobber(x2) && clobber(mem2)) {
+                       break
+               }
+               v.reset(OpAMD64MOVOstore)
+               v.AuxInt = i - 8
+               v.Aux = s
+               v.AddArg(p)
+               v0 := b.NewValue0(v.Pos, OpAMD64MOVOload, types.TypeInt128)
+               v0.AuxInt = j - 8
+               v0.Aux = s2
+               v0.AddArg(p2)
+               v0.AddArg(mem)
+               v.AddArg(v0)
+               v.AddArg(mem)
+               return true
+       }
        // match: (MOVQstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
        // cond: canMergeSym(sym1, sym2)
        // result: (MOVQstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
@@ -9602,6 +9815,9 @@ func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool {
                v.AddArg(mem)
                return true
        }
+       return false
+}
+func rewriteValueAMD64_OpAMD64MOVQstore_10(v *Value) bool {
        // match: (MOVQstore [off] {sym} ptr (MOVQf2i val) mem)
        // cond:
        // result: (MOVSDstore [off] {sym} ptr val mem)
@@ -12334,6 +12550,77 @@ func rewriteValueAMD64_OpAMD64MOVWstore_0(v *Value) bool {
        return false
 }
 func rewriteValueAMD64_OpAMD64MOVWstore_10(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (MOVWstore [i] {s} p x1:(MOVWload [j] {s2} p2 mem) mem2:(MOVWstore [i-2] {s} p x2:(MOVWload [j-2] {s2} p2 mem) mem))
+       // cond: x1.Uses == 1   && x2.Uses == 1   && mem2.Uses == 1   && clobber(x1)   && clobber(x2)   && clobber(mem2)
+       // result: (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem)
+       for {
+               i := v.AuxInt
+               s := v.Aux
+               _ = v.Args[2]
+               p := v.Args[0]
+               x1 := v.Args[1]
+               if x1.Op != OpAMD64MOVWload {
+                       break
+               }
+               j := x1.AuxInt
+               s2 := x1.Aux
+               _ = x1.Args[1]
+               p2 := x1.Args[0]
+               mem := x1.Args[1]
+               mem2 := v.Args[2]
+               if mem2.Op != OpAMD64MOVWstore {
+                       break
+               }
+               if mem2.AuxInt != i-2 {
+                       break
+               }
+               if mem2.Aux != s {
+                       break
+               }
+               _ = mem2.Args[2]
+               if p != mem2.Args[0] {
+                       break
+               }
+               x2 := mem2.Args[1]
+               if x2.Op != OpAMD64MOVWload {
+                       break
+               }
+               if x2.AuxInt != j-2 {
+                       break
+               }
+               if x2.Aux != s2 {
+                       break
+               }
+               _ = x2.Args[1]
+               if p2 != x2.Args[0] {
+                       break
+               }
+               if mem != x2.Args[1] {
+                       break
+               }
+               if mem != mem2.Args[2] {
+                       break
+               }
+               if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) {
+                       break
+               }
+               v.reset(OpAMD64MOVLstore)
+               v.AuxInt = i - 2
+               v.Aux = s
+               v.AddArg(p)
+               v0 := b.NewValue0(v.Pos, OpAMD64MOVLload, typ.UInt32)
+               v0.AuxInt = j - 2
+               v0.Aux = s2
+               v0.AddArg(p2)
+               v0.AddArg(mem)
+               v.AddArg(v0)
+               v.AddArg(mem)
+               return true
+       }
        // match: (MOVWstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
        // cond: canMergeSym(sym1, sym2)
        // result: (MOVWstore  [off1+off2] {mergeSym(sym1,sym2)} base val mem)