From: Ilya Tocar
Date: Fri, 18 Aug 2017 19:03:33 +0000 (-0500)
Subject: cmd/compile/internal/ssa: combine consecutive loads and stores on amd64
X-Git-Tag: go1.10beta1~1361
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=9c99512d188b01557f5271b2c65d814487817920;p=gostls13.git

cmd/compile/internal/ssa: combine consecutive loads and stores on amd64

Sometimes (often for calls) we generate code like this:

    MOVQ (addr),AX
    MOVQ 8(addr),BX
    MOVQ AX,(otheraddr)
    MOVQ BX,8(otheraddr)

Replace it with

    MOVUPS (addr),X0
    MOVUPS X0,(otheraddr)

For completeness, do the same for 8-, 16-, and 32-bit loads/stores too.

Shaves 1% from the code sections of the go tool:

    /localdisk/itocar/golang/bin/go 10293917
    go_old 10334877 [40960 bytes]
    read-only data     =   682 bytes (0.040769%)
    global text (code) = 38961 bytes (1.036503%)
    Total difference     39643 bytes (0.674628%)

Updates #6853

Change-Id: I1f0d2f60273a63a079b58927cd1c4e3429d2e7ae
Reviewed-on: https://go-review.googlesource.com/57130
Run-TryBot: Ilya Tocar
TryBot-Result: Gobot Gobot
Reviewed-by: Keith Randall
---

diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go
index 08df053e8d..bfd1f8a784 100644
--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -24,9 +24,10 @@ import (
 // architecture-specific, and they are grouped in arrays of tests, one
 // for each architecture.
 //
-// Each asmTest consists in a function to be compiled and an array of
-// regexps that will be matched to the generated assembly. For
-// example, the following amd64 test
+// Each asmTest consists of a function to compile, an array of
+// positiveRegexps that will be matched to the generated assembly, and
+// an array of negativeRegexps that must not match the generated assembly.
+// For example, the following amd64 test
 //
 // {
 // 	`
@@ -35,10 +36,11 @@ import (
 // 	}
 // 	`,
 // 	[]string{"\tSHLQ\t[$]6,"},
+// 	[]string{"MULQ"},
 // }
 //
 // verifies that the code the compiler generates for a multiplication
-// by 64 contains a 'SHLQ' instruction.
+// by 64 contains a 'SHLQ' instruction and does not contain a MULQ.
 //
 // Since all the tests for a given architecture are dumped in the same
 // file, the function names must be unique.
As a workaround for this @@ -52,6 +54,7 @@ import ( // } // `, // []string{"\tSHLQ\t[$]6,"}, +// []string{"MULQ"} // } // // Each '$'-function will be given a unique name of form f_, @@ -124,16 +127,22 @@ func funcAsm(t *testing.T, asm string, funcName string) string { type asmTest struct { // function to compile function string - // regexps that must match the generated assembly - regexps []string + // positiveRegexps that must match the generated assembly + positiveRegexps []string + negativeRegexps []string } func (at asmTest) verifyAsm(t *testing.T, fa string) { - for _, r := range at.regexps { + for _, r := range at.positiveRegexps { if b, err := regexp.MatchString(r, fa); !b || err != nil { t.Errorf("expected:%s\ngo:%s\nasm:%s\n", r, at.function, fa) } } + for _, r := range at.negativeRegexps { + if b, err := regexp.MatchString(r, fa); b || err != nil { + t.Errorf("not expected:%s\ngo:%s\nasm:%s\n", r, at.function, fa) + } + } } type asmTests struct { @@ -214,7 +223,7 @@ var allAsmTests = []*asmTests{ { arch: "amd64", os: "linux", - imports: []string{"encoding/binary", "math", "math/bits", "unsafe"}, + imports: []string{"encoding/binary", "math", "math/bits", "unsafe", "runtime"}, tests: linuxAMD64Tests, }, { @@ -262,6 +271,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tSHLQ\t\\$6,"}, + []string{}, }, { ` @@ -270,6 +280,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tSHLQ\t\\$5,", "\tLEAQ\t\\(.*\\)\\(.*\\*2\\),"}, + []string{}, }, // Load-combining tests. { @@ -279,6 +290,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVQ\t\\(.*\\),"}, + []string{}, }, { ` @@ -287,6 +299,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVQ\t\\(.*\\)\\(.*\\*1\\),"}, + []string{}, }, { ` @@ -295,6 +308,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVL\t\\(.*\\),"}, + []string{}, }, { ` @@ -303,6 +317,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"}, + []string{}, }, { ` @@ -311,6 +326,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPQ\t"}, + []string{}, }, { ` @@ -319,6 +335,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPQ\t"}, + []string{}, }, { ` @@ -327,6 +344,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPQ\t"}, + []string{}, }, { ` @@ -335,6 +353,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPQ\t"}, + []string{}, }, { ` @@ -343,6 +362,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPL\t"}, + []string{}, }, { ` @@ -351,6 +371,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPL\t"}, + []string{}, }, { ` @@ -359,6 +380,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPL\t"}, + []string{}, }, { ` @@ -367,6 +389,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPL\t"}, + []string{}, }, { ` @@ -375,6 +398,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLW\t\\$8,"}, + []string{}, }, { ` @@ -383,6 +407,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLW\t\\$8,"}, + []string{}, }, { ` @@ -391,6 +416,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLW\t\\$8,"}, + []string{}, }, { ` @@ -399,6 +425,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLW\t\\$8,"}, + []string{}, }, // Structure zeroing. See issue #18370. { @@ -411,6 +438,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)"}, + []string{}, }, // SSA-able composite literal initialization. Issue 18872. 
{ @@ -424,6 +452,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVQ\t[$]1", "\tMOVQ\t[$]2", "\tMOVQ\t[$]3", "\tMOVQ\t[$]4"}, + []string{}, }, // Also test struct containing pointers (this was special because of write barriers). { @@ -436,6 +465,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)", "\tCALL\truntime\\.writebarrierptr\\(SB\\)"}, + []string{}, }, // Rotate tests { @@ -445,6 +475,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLQ\t[$]7,"}, + []string{}, }, { ` @@ -453,6 +484,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLQ\t[$]7,"}, + []string{}, }, { ` @@ -461,6 +493,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLQ\t[$]7,"}, + []string{}, }, { ` @@ -469,6 +502,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLL\t[$]7,"}, + []string{}, }, { ` @@ -477,6 +511,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLL\t[$]7,"}, + []string{}, }, { ` @@ -485,6 +520,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLL\t[$]7,"}, + []string{}, }, { ` @@ -493,6 +529,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLW\t[$]7,"}, + []string{}, }, { ` @@ -501,6 +538,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLW\t[$]7,"}, + []string{}, }, { ` @@ -509,6 +547,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLW\t[$]7,"}, + []string{}, }, { ` @@ -517,6 +556,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLB\t[$]7,"}, + []string{}, }, { ` @@ -525,6 +565,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLB\t[$]7,"}, + []string{}, }, { ` @@ -533,6 +574,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLB\t[$]7,"}, + []string{}, }, // Rotate after inlining (see issue 18254). { @@ -545,6 +587,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLL\t[$]7,"}, + []string{}, }, { ` @@ -553,6 +596,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVQ\t[$]5,"}, + []string{}, }, // Direct use of constants in fast map access calls. Issue 19015. { @@ -563,6 +607,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVQ\t[$]5,"}, + []string{}, }, { ` @@ -571,6 +616,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\"abc\""}, + []string{}, }, { ` @@ -580,6 +626,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\"abc\""}, + []string{}, }, // Bit test ops on amd64, issue 18943. 
{ @@ -592,6 +639,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBTQ\t"}, + []string{}, }, { ` @@ -600,6 +648,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBTQ\t"}, + []string{}, }, { ` @@ -611,6 +660,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBTQ\t\\$60"}, + []string{}, }, { ` @@ -619,6 +669,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBTQ\t\\$60"}, + []string{}, }, // Intrinsic tests for math/bits { @@ -628,6 +679,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSFQ\t", "\tMOVL\t\\$64,", "\tCMOVQEQ\t"}, + []string{}, }, { ` @@ -636,6 +688,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSFQ\t", "\tORQ\t[^$]", "\tMOVQ\t\\$4294967296,"}, + []string{}, }, { ` @@ -644,6 +697,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSFQ\t", "\tORQ\t\\$65536,"}, + []string{}, }, { ` @@ -652,6 +706,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSFQ\t", "\tORQ\t\\$256,"}, + []string{}, }, { ` @@ -660,6 +715,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPQ\t"}, + []string{}, }, { ` @@ -668,6 +724,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSWAPL\t"}, + []string{}, }, { ` @@ -676,6 +733,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tROLW\t\\$8,"}, + []string{}, }, { ` @@ -684,6 +742,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, { ` @@ -692,6 +751,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, { ` @@ -700,6 +760,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, /* see ssa.go { @@ -709,6 +770,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, */ { @@ -718,6 +780,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, { ` @@ -726,6 +789,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, { ` @@ -734,6 +798,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, { ` @@ -742,6 +807,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, /* see ssa.go { @@ -751,6 +817,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, */ { @@ -760,6 +827,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tBSRQ\t"}, + []string{}, }, { ` @@ -767,6 +835,7 @@ var linuxAMD64Tests = []*asmTest{ return bits.OnesCount64(x) }`, []string{"\tPOPCNTQ\t", "support_popcnt"}, + []string{}, }, { ` @@ -774,6 +843,7 @@ var linuxAMD64Tests = []*asmTest{ return bits.OnesCount32(x) }`, []string{"\tPOPCNTL\t", "support_popcnt"}, + []string{}, }, { ` @@ -781,6 +851,7 @@ var linuxAMD64Tests = []*asmTest{ return bits.OnesCount16(x) }`, []string{"\tPOPCNTL\t", "support_popcnt"}, + []string{}, }, { ` @@ -788,6 +859,7 @@ var linuxAMD64Tests = []*asmTest{ return bits.OnesCount(x) }`, []string{"\tPOPCNTQ\t", "support_popcnt"}, + []string{}, }, // multiplication merging tests { @@ -796,6 +868,7 @@ var linuxAMD64Tests = []*asmTest{ return 15*n + 31*n }`, []string{"\tIMULQ\t[$]46"}, // 46*n + []string{}, }, { ` @@ -803,6 +876,7 @@ var linuxAMD64Tests = []*asmTest{ return 5*n + 7*(n+1) + 11*(n+2) }`, []string{"\tIMULQ\t[$]23", "\tADDQ\t[$]29"}, // 23*n + 29 + []string{}, }, { ` @@ -810,6 +884,7 @@ var linuxAMD64Tests = []*asmTest{ return a*n + 19*n }`, []string{"\tADDQ\t[$]19", "\tIMULQ"}, // (a+19)*n + []string{}, }, // see issue 19595. 
@@ -821,6 +896,7 @@ var linuxAMD64Tests = []*asmTest{ *q += x }`, []string{"\tADDQ\t\\("}, + []string{}, }, { ` @@ -831,6 +907,7 @@ var linuxAMD64Tests = []*asmTest{ } }`, []string{"\tADDQ\t[A-Z]"}, + []string{}, }, // Floating-point strength reduction { @@ -839,6 +916,7 @@ var linuxAMD64Tests = []*asmTest{ return f * 2.0 }`, []string{"\tADDSD\t"}, + []string{}, }, { ` @@ -846,6 +924,7 @@ var linuxAMD64Tests = []*asmTest{ return f / 16.0 }`, []string{"\tMULSD\t"}, + []string{}, }, { ` @@ -853,6 +932,7 @@ var linuxAMD64Tests = []*asmTest{ return f / 0.125 }`, []string{"\tMULSD\t"}, + []string{}, }, { ` @@ -860,6 +940,7 @@ var linuxAMD64Tests = []*asmTest{ return f / 0.5 }`, []string{"\tADDSD\t"}, + []string{}, }, // Check that compare to constant string uses 2/4/8 byte compares { @@ -868,6 +949,7 @@ var linuxAMD64Tests = []*asmTest{ return a == "xx" }`, []string{"\tCMPW\t[A-Z]"}, + []string{}, }, { ` @@ -875,6 +957,7 @@ var linuxAMD64Tests = []*asmTest{ return a == "xxxx" }`, []string{"\tCMPL\t[A-Z]"}, + []string{}, }, { ` @@ -882,6 +965,7 @@ var linuxAMD64Tests = []*asmTest{ return a == "xxxxxxxx" }`, []string{"\tCMPQ\t[A-Z]"}, + []string{}, }, // Non-constant rotate { @@ -890,6 +974,7 @@ var linuxAMD64Tests = []*asmTest{ return x << z | x >> (64-z) }`, []string{"\tROLQ\t"}, + []string{}, }, { `func rot64r(x uint64, y int) uint64 { @@ -897,6 +982,7 @@ var linuxAMD64Tests = []*asmTest{ return x >> z | x << (64-z) }`, []string{"\tRORQ\t"}, + []string{}, }, { `func rot32l(x uint32, y int) uint32 { @@ -904,6 +990,7 @@ var linuxAMD64Tests = []*asmTest{ return x << z | x >> (32-z) }`, []string{"\tROLL\t"}, + []string{}, }, { `func rot32r(x uint32, y int) uint32 { @@ -911,6 +998,7 @@ var linuxAMD64Tests = []*asmTest{ return x >> z | x << (32-z) }`, []string{"\tRORL\t"}, + []string{}, }, { `func rot16l(x uint16, y int) uint16 { @@ -918,6 +1006,7 @@ var linuxAMD64Tests = []*asmTest{ return x << z | x >> (16-z) }`, []string{"\tROLW\t"}, + []string{}, }, { `func rot16r(x uint16, y int) uint16 { @@ -925,6 +1014,7 @@ var linuxAMD64Tests = []*asmTest{ return x >> z | x << (16-z) }`, []string{"\tRORW\t"}, + []string{}, }, { `func rot8l(x uint8, y int) uint8 { @@ -932,6 +1022,7 @@ var linuxAMD64Tests = []*asmTest{ return x << z | x >> (8-z) }`, []string{"\tROLB\t"}, + []string{}, }, { `func rot8r(x uint8, y int) uint8 { @@ -939,6 +1030,7 @@ var linuxAMD64Tests = []*asmTest{ return x >> z | x << (8-z) }`, []string{"\tRORB\t"}, + []string{}, }, // Check that array compare uses 2/4/8 byte compares { @@ -947,6 +1039,7 @@ var linuxAMD64Tests = []*asmTest{ return a == b }`, []string{"\tCMPW\t[A-Z]"}, + []string{}, }, { ` @@ -954,6 +1047,7 @@ var linuxAMD64Tests = []*asmTest{ return a == b }`, []string{"\tCMPL\t[A-Z]"}, + []string{}, }, { ` @@ -961,6 +1055,7 @@ var linuxAMD64Tests = []*asmTest{ return a == b }`, []string{"\tCMPQ\t[A-Z]"}, + []string{}, }, { ` @@ -968,6 +1063,7 @@ var linuxAMD64Tests = []*asmTest{ return *((*[4]byte)(a)) != *((*[4]byte)(b)) }`, []string{"\tCMPL\t[A-Z]"}, + []string{}, }, { // make sure assembly output has matching offset and base register. 
@@ -979,6 +1075,56 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"b\\+40\\(SP\\)"}, + []string{}, + }, + { + // check load combining + ` + func f73(a, b byte) (byte,byte) { + return f73(f73(a,b)) + } + `, + []string{"\tMOVW\t"}, + []string{}, + }, + { + ` + func f74(a, b uint16) (uint16,uint16) { + return f74(f74(a,b)) + } + `, + []string{"\tMOVL\t"}, + []string{}, + }, + { + ` + func f75(a, b uint32) (uint32,uint32) { + return f75(f75(a,b)) + } + `, + []string{"\tMOVQ\t"}, + []string{}, + }, + { + ` + func f76(a, b uint64) (uint64,uint64) { + return f76(f76(a,b)) + } + `, + []string{"\tMOVUPS\t"}, + []string{}, + }, + // Make sure we don't put pointers in SSE registers across safe points. + { + ` + func $(p, q *[2]*int) { + a, b := p[0], p[1] + runtime.GC() + q[0], q[1] = a, b + } + `, + []string{}, + []string{"MOVUPS"}, }, { // check that stack store is optimized away @@ -989,6 +1135,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"TEXT\t.*, [$]0-8"}, + []string{}, }, // math.Abs using integer registers { @@ -998,6 +1145,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tSHLQ\t[$]1,", "\tSHRQ\t[$]1,"}, + []string{}, }, // math.Copysign using integer registers { @@ -1007,6 +1155,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tSHLQ\t[$]1,", "\tSHRQ\t[$]1,", "\tSHRQ\t[$]63,", "\tSHLQ\t[$]63,", "\tORQ\t"}, + []string{}, }, // int <-> fp moves { @@ -1016,6 +1165,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVQ\tX.*, [^X].*"}, + []string{}, }, { ` @@ -1024,6 +1174,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVL\tX.*, [^X].*"}, + []string{}, }, { ` @@ -1032,6 +1183,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVQ\t[^X].*, X.*"}, + []string{}, }, { ` @@ -1040,6 +1192,7 @@ var linuxAMD64Tests = []*asmTest{ } `, []string{"\tMOVL\t[^X].*, X.*"}, + []string{}, }, } @@ -1051,6 +1204,7 @@ var linux386Tests = []*asmTest{ } `, []string{"\tMOVL\t\\(.*\\),"}, + []string{}, }, { ` @@ -1059,6 +1213,7 @@ var linux386Tests = []*asmTest{ } `, []string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"}, + []string{}, }, // multiplication merging tests @@ -1068,6 +1223,7 @@ var linux386Tests = []*asmTest{ return 9*n + 14*n }`, []string{"\tIMULL\t[$]23"}, // 23*n + []string{}, }, { ` @@ -1075,6 +1231,7 @@ var linux386Tests = []*asmTest{ return 19*a + a*n }`, []string{"\tADDL\t[$]19", "\tIMULL"}, // (n+19)*a + []string{}, }, { // check that stack store is optimized away @@ -1085,6 +1242,7 @@ var linux386Tests = []*asmTest{ } `, []string{"TEXT\t.*, [$]0-4"}, + []string{}, }, } @@ -1096,6 +1254,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVWBR\t\\(.*\\),"}, + []string{}, }, { ` @@ -1104,6 +1263,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVWBR\t\\(.*\\)\\(.*\\*1\\),"}, + []string{}, }, { ` @@ -1112,6 +1272,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVDBR\t\\(.*\\),"}, + []string{}, }, { ` @@ -1120,6 +1281,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVDBR\t\\(.*\\)\\(.*\\*1\\),"}, + []string{}, }, { ` @@ -1128,6 +1290,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVWZ\t\\(.*\\),"}, + []string{}, }, { ` @@ -1136,6 +1299,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVWZ\t\\(.*\\)\\(.*\\*1\\),"}, + []string{}, }, { ` @@ -1144,6 +1308,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVD\t\\(.*\\),"}, + []string{}, }, { ` @@ -1152,6 +1317,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVD\t\\(.*\\)\\(.*\\*1\\),"}, + []string{}, }, { ` @@ -1160,6 +1326,7 @@ var linuxS390XTests = 
[]*asmTest{ } `, []string{"\tRLLG\t[$]7,"}, + []string{}, }, { ` @@ -1168,6 +1335,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tRLLG\t[$]7,"}, + []string{}, }, { ` @@ -1176,6 +1344,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tRLLG\t[$]7,"}, + []string{}, }, { ` @@ -1184,6 +1353,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tRLL\t[$]7,"}, + []string{}, }, { ` @@ -1192,6 +1362,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tRLL\t[$]7,"}, + []string{}, }, { ` @@ -1200,6 +1371,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tRLL\t[$]7,"}, + []string{}, }, // Fused multiply-add/sub instructions. { @@ -1209,6 +1381,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFMADD\t"}, + []string{}, }, { ` @@ -1217,6 +1390,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFMSUB\t"}, + []string{}, }, { ` @@ -1225,6 +1399,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFMADDS\t"}, + []string{}, }, { ` @@ -1233,6 +1408,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFMSUBS\t"}, + []string{}, }, // Intrinsic tests for math/bits { @@ -1242,6 +1418,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1250,6 +1427,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t", "\tMOVWZ\t"}, + []string{}, }, { ` @@ -1258,6 +1436,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t", "\tOR\t\\$65536,"}, + []string{}, }, { ` @@ -1266,6 +1445,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t", "\tOR\t\\$256,"}, + []string{}, }, // Intrinsic tests for math/bits { @@ -1275,6 +1455,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVDBR\t"}, + []string{}, }, { ` @@ -1283,6 +1464,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tMOVWBR\t"}, + []string{}, }, { ` @@ -1291,6 +1473,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1299,6 +1482,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1307,6 +1491,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1315,6 +1500,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1323,6 +1509,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1331,6 +1518,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1339,6 +1527,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1347,6 +1536,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1355,6 +1545,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { ` @@ -1363,6 +1554,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"\tFLOGR\t"}, + []string{}, }, { // check that stack store is optimized away @@ -1373,6 +1565,7 @@ var linuxS390XTests = []*asmTest{ } `, []string{"TEXT\t.*, [$]0-8"}, + []string{}, }, } @@ -1384,6 +1577,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tMOVW\tR[0-9]+@>25,"}, + []string{}, }, { ` @@ -1392,6 +1586,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tMOVW\tR[0-9]+@>25,"}, + []string{}, }, { ` @@ -1400,6 +1595,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tMOVW\tR[0-9]+@>25,"}, + []string{}, }, { ` @@ -1408,6 +1604,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1416,6 +1613,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { 
` @@ -1424,6 +1622,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1432,6 +1631,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1440,6 +1640,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1448,6 +1649,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1456,6 +1658,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1464,6 +1667,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1472,6 +1676,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1480,6 +1685,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { // make sure assembly output has matching offset and base register. @@ -1491,6 +1697,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"b\\+4\\(FP\\)"}, + []string{}, }, { // check that stack store is optimized away @@ -1501,6 +1708,7 @@ var linuxARMTests = []*asmTest{ } `, []string{"TEXT\t.*, [$]-4-4"}, + []string{}, }, } @@ -1512,6 +1720,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tROR\t[$]57,"}, + []string{}, }, { ` @@ -1520,6 +1729,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tROR\t[$]57,"}, + []string{}, }, { ` @@ -1528,6 +1738,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tROR\t[$]57,"}, + []string{}, }, { ` @@ -1536,6 +1747,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tRORW\t[$]25,"}, + []string{}, }, { ` @@ -1544,6 +1756,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tRORW\t[$]25,"}, + []string{}, }, { ` @@ -1552,6 +1765,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tRORW\t[$]25,"}, + []string{}, }, { ` @@ -1560,6 +1774,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tREV\t"}, + []string{}, }, { ` @@ -1568,6 +1783,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tREVW\t"}, + []string{}, }, { ` @@ -1576,6 +1792,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1584,6 +1801,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1592,6 +1810,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1600,6 +1819,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1608,6 +1828,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1616,6 +1837,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1624,6 +1846,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1632,6 +1855,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1640,6 +1864,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1648,6 +1873,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1656,6 +1882,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tAND\t"}, + []string{}, }, { ` @@ -1664,6 +1891,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tAND\t"}, + []string{}, }, { // make sure offsets are folded into load and store. 
@@ -1674,6 +1902,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"\tMOVD\t\"\"\\.a\\+[0-9]+\\(FP\\), R[0-9]+", "\tMOVD\tR[0-9]+, \"\"\\.b\\+[0-9]+\\(FP\\)"}, + []string{}, }, { // check that stack store is optimized away @@ -1684,6 +1913,7 @@ var linuxARM64Tests = []*asmTest{ } `, []string{"TEXT\t.*, [$]-8-8"}, + []string{}, }, } @@ -1695,6 +1925,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1703,6 +1934,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1711,6 +1943,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1719,6 +1952,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1727,6 +1961,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1735,6 +1970,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1743,6 +1979,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1751,6 +1988,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1759,6 +1997,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { ` @@ -1767,6 +2006,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"\tCLZ\t"}, + []string{}, }, { // check that stack store is optimized away @@ -1777,6 +2017,7 @@ var linuxMIPSTests = []*asmTest{ } `, []string{"TEXT\t.*, [$]-4-4"}, + []string{}, }, } @@ -1789,6 +2030,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tFMADD\t"}, + []string{}, }, { ` @@ -1797,6 +2039,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tFMSUB\t"}, + []string{}, }, { ` @@ -1805,6 +2048,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tFMADDS\t"}, + []string{}, }, { ` @@ -1813,6 +2057,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tFMSUBS\t"}, + []string{}, }, { ` @@ -1821,6 +2066,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tROTLW\t"}, + []string{}, }, { ` @@ -1829,6 +2075,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tROTLW\t"}, + []string{}, }, { ` @@ -1837,6 +2084,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tROTLW\t"}, + []string{}, }, { ` @@ -1845,6 +2093,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tROTL\t"}, + []string{}, }, { ` @@ -1853,6 +2102,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tROTL\t"}, + []string{}, }, { ` @@ -1861,6 +2111,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"\tROTL\t"}, + []string{}, }, { // check that stack store is optimized away @@ -1871,6 +2122,7 @@ var linuxPPC64LETests = []*asmTest{ } `, []string{"TEXT\t.*, [$]0-8"}, + []string{}, }, } diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index e7616a4ae6..0e19e5970a 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -2327,6 +2327,58 @@ && clobber(x) -> (MOVQstoreidx1 [i-4] {s} p (SHLQconst [2] idx) w0 mem) +(MOVBstore [i] {s} p + x1:(MOVBload [j] {s2} p2 mem) + mem2:(MOVBstore [i-1] {s} p + x2:(MOVBload [j-1] {s2} p2 mem) mem)) + && x1.Uses == 1 + && x2.Uses == 1 + && mem2.Uses == 1 + && clobber(x1) + && clobber(x2) + && clobber(mem2) + -> (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem) + +(MOVWstore [i] {s} p + x1:(MOVWload [j] {s2} p2 mem) + mem2:(MOVWstore [i-2] {s} p + x2:(MOVWload [j-2] {s2} p2 mem) mem)) + && x1.Uses == 1 + && x2.Uses == 1 + && mem2.Uses == 1 + && 
clobber(x1) + && clobber(x2) + && clobber(mem2) + -> (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem) + +(MOVLstore [i] {s} p + x1:(MOVLload [j] {s2} p2 mem) + mem2:(MOVLstore [i-4] {s} p + x2:(MOVLload [j-4] {s2} p2 mem) mem)) + && x1.Uses == 1 + && x2.Uses == 1 + && mem2.Uses == 1 + && clobber(x1) + && clobber(x2) + && clobber(mem2) + -> (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem) + +// This is somewhat tricky. There may be pointers in SSE registers due to rule below. +// However those register shouldn't live across GC safepoint. +(MOVQstore [i] {s} p + x1:(MOVQload [j] {s2} p2 mem) + mem2:(MOVQstore [i-8] {s} p + x2:(MOVQload [j-8] {s2} p2 mem) mem)) + && x1.Uses == 1 + && x2.Uses == 1 + && mem2.Uses == 1 + && config.useSSE + && clobber(x1) + && clobber(x2) + && clobber(mem2) + -> (MOVOstore [i-8] {s} p (MOVOload [j-8] {s2} p2 mem) mem) + + // amd64p32 rules // same as the rules above, but with 32 instead of 64 bit pointer arithmetic. // LEAQ,ADDQ -> LEAL,ADDL diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 163790c970..e2971696bb 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -154,7 +154,7 @@ func rewriteValueAMD64(v *Value) bool { case OpAMD64MOVQloadidx8: return rewriteValueAMD64_OpAMD64MOVQloadidx8_0(v) case OpAMD64MOVQstore: - return rewriteValueAMD64_OpAMD64MOVQstore_0(v) + return rewriteValueAMD64_OpAMD64MOVQstore_0(v) || rewriteValueAMD64_OpAMD64MOVQstore_10(v) case OpAMD64MOVQstoreconst: return rewriteValueAMD64_OpAMD64MOVQstoreconst_0(v) case OpAMD64MOVQstoreconstidx1: @@ -5690,6 +5690,10 @@ func rewriteValueAMD64_OpAMD64MOVBstore_0(v *Value) bool { return false } func rewriteValueAMD64_OpAMD64MOVBstore_10(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ // match: (MOVBstore [i] {s} p (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p w mem)) // cond: x.Uses == 1 && clobber(x) // result: (MOVWstore [i-1] {s} p w mem) @@ -5785,6 +5789,73 @@ func rewriteValueAMD64_OpAMD64MOVBstore_10(v *Value) bool { v.AddArg(mem) return true } + // match: (MOVBstore [i] {s} p x1:(MOVBload [j] {s2} p2 mem) mem2:(MOVBstore [i-1] {s} p x2:(MOVBload [j-1] {s2} p2 mem) mem)) + // cond: x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2) + // result: (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem) + for { + i := v.AuxInt + s := v.Aux + _ = v.Args[2] + p := v.Args[0] + x1 := v.Args[1] + if x1.Op != OpAMD64MOVBload { + break + } + j := x1.AuxInt + s2 := x1.Aux + _ = x1.Args[1] + p2 := x1.Args[0] + mem := x1.Args[1] + mem2 := v.Args[2] + if mem2.Op != OpAMD64MOVBstore { + break + } + if mem2.AuxInt != i-1 { + break + } + if mem2.Aux != s { + break + } + _ = mem2.Args[2] + if p != mem2.Args[0] { + break + } + x2 := mem2.Args[1] + if x2.Op != OpAMD64MOVBload { + break + } + if x2.AuxInt != j-1 { + break + } + if x2.Aux != s2 { + break + } + _ = x2.Args[1] + if p2 != x2.Args[0] { + break + } + if mem != x2.Args[1] { + break + } + if mem != mem2.Args[2] { + break + } + if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) { + break + } + v.reset(OpAMD64MOVWstore) + v.AuxInt = i - 1 + v.Aux = s + v.AddArg(p) + v0 := b.NewValue0(v.Pos, OpAMD64MOVWload, typ.UInt16) + v0.AuxInt = j - 1 + v0.Aux = s2 + v0.AddArg(p2) + v0.AddArg(mem) + v.AddArg(v0) + v.AddArg(mem) + return true + } // match: (MOVBstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) 
// cond: canMergeSym(sym1, sym2) // result: (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) @@ -7810,6 +7881,77 @@ func rewriteValueAMD64_OpAMD64MOVLstore_0(v *Value) bool { return false } func rewriteValueAMD64_OpAMD64MOVLstore_10(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (MOVLstore [i] {s} p x1:(MOVLload [j] {s2} p2 mem) mem2:(MOVLstore [i-4] {s} p x2:(MOVLload [j-4] {s2} p2 mem) mem)) + // cond: x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2) + // result: (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem) + for { + i := v.AuxInt + s := v.Aux + _ = v.Args[2] + p := v.Args[0] + x1 := v.Args[1] + if x1.Op != OpAMD64MOVLload { + break + } + j := x1.AuxInt + s2 := x1.Aux + _ = x1.Args[1] + p2 := x1.Args[0] + mem := x1.Args[1] + mem2 := v.Args[2] + if mem2.Op != OpAMD64MOVLstore { + break + } + if mem2.AuxInt != i-4 { + break + } + if mem2.Aux != s { + break + } + _ = mem2.Args[2] + if p != mem2.Args[0] { + break + } + x2 := mem2.Args[1] + if x2.Op != OpAMD64MOVLload { + break + } + if x2.AuxInt != j-4 { + break + } + if x2.Aux != s2 { + break + } + _ = x2.Args[1] + if p2 != x2.Args[0] { + break + } + if mem != x2.Args[1] { + break + } + if mem != mem2.Args[2] { + break + } + if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) { + break + } + v.reset(OpAMD64MOVQstore) + v.AuxInt = i - 4 + v.Aux = s + v.AddArg(p) + v0 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64) + v0.AuxInt = j - 4 + v0.Aux = s2 + v0.AddArg(p2) + v0.AddArg(mem) + v.AddArg(v0) + v.AddArg(mem) + return true + } // match: (MOVLstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) // cond: canMergeSym(sym1, sym2) // result: (MOVLstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) @@ -9345,6 +9487,10 @@ func rewriteValueAMD64_OpAMD64MOVQloadidx8_0(v *Value) bool { return false } func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool { + b := v.Block + _ = b + config := b.Func.Config + _ = config // match: (MOVQstore [off1] {sym} (ADDQconst [off2] ptr) val mem) // cond: is32Bit(off1+off2) // result: (MOVQstore [off1+off2] {sym} ptr val mem) @@ -9510,6 +9656,73 @@ func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool { v.AddArg(mem) return true } + // match: (MOVQstore [i] {s} p x1:(MOVQload [j] {s2} p2 mem) mem2:(MOVQstore [i-8] {s} p x2:(MOVQload [j-8] {s2} p2 mem) mem)) + // cond: x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && config.useSSE && clobber(x1) && clobber(x2) && clobber(mem2) + // result: (MOVOstore [i-8] {s} p (MOVOload [j-8] {s2} p2 mem) mem) + for { + i := v.AuxInt + s := v.Aux + _ = v.Args[2] + p := v.Args[0] + x1 := v.Args[1] + if x1.Op != OpAMD64MOVQload { + break + } + j := x1.AuxInt + s2 := x1.Aux + _ = x1.Args[1] + p2 := x1.Args[0] + mem := x1.Args[1] + mem2 := v.Args[2] + if mem2.Op != OpAMD64MOVQstore { + break + } + if mem2.AuxInt != i-8 { + break + } + if mem2.Aux != s { + break + } + _ = mem2.Args[2] + if p != mem2.Args[0] { + break + } + x2 := mem2.Args[1] + if x2.Op != OpAMD64MOVQload { + break + } + if x2.AuxInt != j-8 { + break + } + if x2.Aux != s2 { + break + } + _ = x2.Args[1] + if p2 != x2.Args[0] { + break + } + if mem != x2.Args[1] { + break + } + if mem != mem2.Args[2] { + break + } + if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && config.useSSE && clobber(x1) && clobber(x2) && clobber(mem2)) { + break + } + v.reset(OpAMD64MOVOstore) + v.AuxInt = i - 8 + v.Aux = s + v.AddArg(p) + v0 := b.NewValue0(v.Pos, 
OpAMD64MOVOload, types.TypeInt128) + v0.AuxInt = j - 8 + v0.Aux = s2 + v0.AddArg(p2) + v0.AddArg(mem) + v.AddArg(v0) + v.AddArg(mem) + return true + } // match: (MOVQstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) // cond: canMergeSym(sym1, sym2) // result: (MOVQstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) @@ -9602,6 +9815,9 @@ func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool { v.AddArg(mem) return true } + return false +} +func rewriteValueAMD64_OpAMD64MOVQstore_10(v *Value) bool { // match: (MOVQstore [off] {sym} ptr (MOVQf2i val) mem) // cond: // result: (MOVSDstore [off] {sym} ptr val mem) @@ -12334,6 +12550,77 @@ func rewriteValueAMD64_OpAMD64MOVWstore_0(v *Value) bool { return false } func rewriteValueAMD64_OpAMD64MOVWstore_10(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (MOVWstore [i] {s} p x1:(MOVWload [j] {s2} p2 mem) mem2:(MOVWstore [i-2] {s} p x2:(MOVWload [j-2] {s2} p2 mem) mem)) + // cond: x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2) + // result: (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem) + for { + i := v.AuxInt + s := v.Aux + _ = v.Args[2] + p := v.Args[0] + x1 := v.Args[1] + if x1.Op != OpAMD64MOVWload { + break + } + j := x1.AuxInt + s2 := x1.Aux + _ = x1.Args[1] + p2 := x1.Args[0] + mem := x1.Args[1] + mem2 := v.Args[2] + if mem2.Op != OpAMD64MOVWstore { + break + } + if mem2.AuxInt != i-2 { + break + } + if mem2.Aux != s { + break + } + _ = mem2.Args[2] + if p != mem2.Args[0] { + break + } + x2 := mem2.Args[1] + if x2.Op != OpAMD64MOVWload { + break + } + if x2.AuxInt != j-2 { + break + } + if x2.Aux != s2 { + break + } + _ = x2.Args[1] + if p2 != x2.Args[0] { + break + } + if mem != x2.Args[1] { + break + } + if mem != mem2.Args[2] { + break + } + if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) { + break + } + v.reset(OpAMD64MOVLstore) + v.AuxInt = i - 2 + v.Aux = s + v.AddArg(p) + v0 := b.NewValue0(v.Pos, OpAMD64MOVLload, typ.UInt32) + v0.AuxInt = j - 2 + v0.Aux = s2 + v0.AddArg(p2) + v0.AddArg(mem) + v.AddArg(v0) + v.AddArg(mem) + return true + } // match: (MOVWstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) // cond: canMergeSym(sym1, sym2) // result: (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
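
Editor's note: the sketch below is not part of the patch. The function names shuffle and twice are made up for illustration, and it assumes the go1.10-era stack-based calling convention on an SSE-capable amd64 target (config.useSSE). It mirrors the f76 test added in asm_test.go above: the two 8-byte results of the inner call are copied into the argument slots of the outer call, a copy that previously took two MOVQ loads plus two MOVQ stores and that the new rules can merge into a single MOVUPS load/store pair.

```go
package main

// shuffle is kept out of line so the calls in twice remain real calls;
// its name and body are illustrative only and do not appear in the patch.
//go:noinline
func shuffle(a, b uint64) (uint64, uint64) {
	return b, a
}

// twice feeds the two 8-byte results of the inner call straight into
// the argument slots of the outer call. Pre-patch, that copy compiles
// to two MOVQ loads and two MOVQ stores at consecutive stack offsets;
// with this change (and config.useSSE) the MOVQload/MOVQstore pairs
// combine into MOVUPS, the same shape the f76 test checks for.
func twice(a, b uint64) (uint64, uint64) {
	return shuffle(shuffle(a, b))
}

func main() {
	x, y := twice(3, 4)
	println(x, y)
}
```

On a toolchain with this change, compiling with `go tool compile -S` (or `go build -gcflags=-S`) should show a MOVUPS in twice where the two MOVQ/MOVQ copy pairs appeared before; on targets or configurations without SSE the 64-bit rule is gated off by config.useSSE and the MOVQ sequence remains.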