]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: optimize 386 code with MULLload/DIVSSload/DIVSDload
authorBen Shi <powerman1st@163.com>
Sun, 24 Jun 2018 07:04:21 +0000 (07:04 +0000)
committerBen Shi <powerman1st@163.com>
Wed, 22 Aug 2018 03:38:38 +0000 (03:38 +0000)
IMULL/DIVSS/DIVSD all can take the source operand from memory
directly. And this CL implement that optimization.

1. The total size of pkg/linux_386 decreases about 84KB (excluding
cmd/compile).

2. The go1 benchmark shows little regression in total (excluding noise).
name                     old time/op    new time/op    delta
BinaryTree17-4              3.29s ± 2%     3.27s ± 4%    ~     (p=0.192 n=30+30)
Fannkuch11-4                3.49s ± 2%     3.54s ± 1%  +1.48%  (p=0.000 n=30+30)
FmtFprintfEmpty-4          45.9ns ± 3%    46.3ns ± 4%  +0.89%  (p=0.037 n=30+30)
FmtFprintfString-4         78.8ns ± 3%    78.7ns ± 4%    ~     (p=0.209 n=30+27)
FmtFprintfInt-4            91.0ns ± 2%    90.3ns ± 2%  -0.82%  (p=0.031 n=30+27)
FmtFprintfIntInt-4          142ns ± 4%     143ns ± 4%    ~     (p=0.136 n=30+30)
FmtFprintfPrefixedInt-4     181ns ± 3%     183ns ± 4%  +1.40%  (p=0.005 n=30+30)
FmtFprintfFloat-4           404ns ± 4%     408ns ± 3%    ~     (p=0.397 n=30+30)
FmtManyArgs-4               601ns ± 3%     609ns ± 5%    ~     (p=0.059 n=30+30)
GobDecode-4                7.21ms ± 5%    7.24ms ± 5%    ~     (p=0.612 n=30+30)
GobEncode-4                6.91ms ± 6%    6.91ms ± 6%    ~     (p=0.797 n=30+30)
Gzip-4                      398ms ± 6%     399ms ± 4%    ~     (p=0.173 n=30+30)
Gunzip-4                   41.7ms ± 3%    41.8ms ± 3%    ~     (p=0.423 n=30+30)
HTTPClientServer-4         62.3µs ± 2%    62.7µs ± 3%    ~     (p=0.085 n=29+30)
JSONEncode-4               21.0ms ± 4%    20.7ms ± 5%  -1.39%  (p=0.014 n=30+30)
JSONDecode-4               66.3ms ± 3%    67.4ms ± 1%  +1.71%  (p=0.003 n=30+24)
Mandelbrot200-4            5.15ms ± 3%    5.16ms ± 3%    ~     (p=0.697 n=30+30)
GoParse-4                  3.24ms ± 3%    3.27ms ± 4%  +0.91%  (p=0.032 n=30+30)
RegexpMatchEasy0_32-4       101ns ± 5%      99ns ± 4%  -1.82%  (p=0.008 n=29+30)
RegexpMatchEasy0_1K-4       848ns ± 4%     841ns ± 2%  -0.77%  (p=0.043 n=30+30)
RegexpMatchEasy1_32-4       106ns ± 6%     106ns ± 3%    ~     (p=0.939 n=29+30)
RegexpMatchEasy1_1K-4      1.02µs ± 3%    1.03µs ± 4%    ~     (p=0.297 n=28+30)
RegexpMatchMedium_32-4      129ns ± 4%     127ns ± 4%    ~     (p=0.073 n=30+30)
RegexpMatchMedium_1K-4     43.9µs ± 3%    43.8µs ± 3%    ~     (p=0.186 n=30+30)
RegexpMatchHard_32-4       2.24µs ± 4%    2.22µs ± 4%    ~     (p=0.332 n=30+29)
RegexpMatchHard_1K-4       68.0µs ± 4%    67.5µs ± 3%    ~     (p=0.290 n=30+30)
Revcomp-4                   1.85s ± 3%     1.85s ± 3%    ~     (p=0.358 n=30+30)
Template-4                 69.6ms ± 3%    70.0ms ± 4%    ~     (p=0.273 n=30+30)
TimeParse-4                 445ns ± 3%     441ns ± 3%    ~     (p=0.494 n=30+30)
TimeFormat-4                412ns ± 3%     412ns ± 6%    ~     (p=0.841 n=30+30)
[Geo mean]                 66.7µs         66.8µs       +0.13%

name                     old speed      new speed      delta
GobDecode-4               107MB/s ± 5%   106MB/s ± 5%    ~     (p=0.615 n=30+30)
GobEncode-4               111MB/s ± 6%   111MB/s ± 6%    ~     (p=0.790 n=30+30)
Gzip-4                   48.8MB/s ± 6%  48.7MB/s ± 4%    ~     (p=0.167 n=30+30)
Gunzip-4                  465MB/s ± 3%   465MB/s ± 3%    ~     (p=0.420 n=30+30)
JSONEncode-4             92.4MB/s ± 4%  93.7MB/s ± 5%  +1.42%  (p=0.015 n=30+30)
JSONDecode-4             29.3MB/s ± 3%  28.8MB/s ± 1%  -1.72%  (p=0.003 n=30+24)
GoParse-4                17.9MB/s ± 3%  17.7MB/s ± 4%  -0.89%  (p=0.037 n=30+30)
RegexpMatchEasy0_32-4     317MB/s ± 8%   324MB/s ± 4%  +2.14%  (p=0.006 n=30+30)
RegexpMatchEasy0_1K-4    1.21GB/s ± 4%  1.22GB/s ± 2%  +0.77%  (p=0.036 n=30+30)
RegexpMatchEasy1_32-4     298MB/s ± 7%   299MB/s ± 4%    ~     (p=0.511 n=30+30)
RegexpMatchEasy1_1K-4    1.00GB/s ± 3%  1.00GB/s ± 4%    ~     (p=0.304 n=28+30)
RegexpMatchMedium_32-4   7.75MB/s ± 4%  7.82MB/s ± 4%    ~     (p=0.089 n=30+30)
RegexpMatchMedium_1K-4   23.3MB/s ± 3%  23.4MB/s ± 3%    ~     (p=0.181 n=30+30)
RegexpMatchHard_32-4     14.3MB/s ± 4%  14.4MB/s ± 4%    ~     (p=0.320 n=30+29)
RegexpMatchHard_1K-4     15.1MB/s ± 4%  15.2MB/s ± 3%    ~     (p=0.273 n=30+30)
Revcomp-4                 137MB/s ± 3%   137MB/s ± 3%    ~     (p=0.352 n=30+30)
Template-4               27.9MB/s ± 3%  27.7MB/s ± 4%    ~     (p=0.277 n=30+30)
[Geo mean]               79.9MB/s       80.1MB/s       +0.15%

Change-Id: I97333cd8ddabb3c7c88ca5aa9e14a005b74d306d
Reviewed-on: https://go-review.googlesource.com/120695
Run-TryBot: Ben Shi <powerman1st@163.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/ssa/gen/386.rules
src/cmd/compile/internal/ssa/gen/386Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewrite386.go
src/cmd/compile/internal/x86/ssa.go
test/codegen/arithmetic.go

index 94f24a81ef50785193ef4ea1a6f35706e9b09712..127829473b2b3b2906c42ff992f8dd2b8539f57f 100644 (file)
 (MOVSSstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSSstore [off1+off2] {sym} ptr val mem)
 (MOVSDstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSDstore [off1+off2] {sym} ptr val mem)
 
-((ADD|SUB|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
-       ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {sym} val base mem)
-((ADD|SUB|MUL)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
-       ((ADD|SUB|MUL)SSload [off1+off2] {sym} val base mem)
-((ADD|SUB|MUL)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
-       ((ADD|SUB|MUL)SDload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
+       ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
+       ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
+       ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem)
 ((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDLconst [off2] base) val mem) && is32Bit(off1+off2) ->
        ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem)
 
 (MOVSDstore [off1] {sym1} (LEAL8 [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
        (MOVSDstoreidx8 [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
 
-((ADD|SUB|AND|OR|XOR)Lload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
        && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-       ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
-((ADD|SUB|MUL)SSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+       ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
        && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-       ((ADD|SUB|MUL)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
-((ADD|SUB|MUL)SDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+       ((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
        && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-       ((ADD|SUB|MUL)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+       ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
 ((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
        && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
        ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
 (MOVSDstoreidx8 [c] {sym} ptr (ADDLconst [d] idx) val mem) -> (MOVSDstoreidx8 [int64(int32(c+8*d))] {sym} ptr idx val mem)
 
 // Merge load/store to op
-((ADD|AND|OR|XOR|SUB)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|AND|OR|XOR|SUB)Lload x [off] {sym} ptr mem)
-((ADD|SUB|MUL)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL)SDload x [off] {sym} ptr mem)
-((ADD|SUB|MUL)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL)SSload x [off] {sym} ptr mem)
+((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
 (MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) -> ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
 (MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y) && clobber(l) ->
        ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
index 7a274269d26aa713d6bf8955d07cfc2ad67e0a1a..40f4a2b15e3f59e9bc02ee8fc78b84e4fe92b1a2 100644 (file)
@@ -183,6 +183,8 @@ func init() {
                {name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
                {name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
                {name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+               {name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+               {name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
 
                // binary ops
                {name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true},                // arg0 + arg1
@@ -279,11 +281,12 @@ func init() {
                {name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
                {name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true},  // arg0 rotate left auxint, rotate amount 0-7
 
-               {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
-               {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
-               {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
-               {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},   // arg0 | tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
-               {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
+               {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},  // arg0 + tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
+               {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},  // arg0 - tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
+               {name: "MULLload", argLength: 3, reg: gp21load, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
+               {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},  // arg0 & tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
+               {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},    // arg0 | tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
+               {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"},  // arg0 ^ tmp, tmp loaded from  arg1+auxint+aux, arg2 = mem
 
                // unary ops
                {name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
index 355462384097faa2e8b750c2e765c8f0176985bc..8ac47cb2d0db2f5a0db66a8c7ab6e1fa38c82667 100644 (file)
@@ -262,6 +262,8 @@ const (
        Op386SUBSDload
        Op386MULSSload
        Op386MULSDload
+       Op386DIVSSload
+       Op386DIVSDload
        Op386ADDL
        Op386ADDLconst
        Op386ADDLcarry
@@ -333,6 +335,7 @@ const (
        Op386ROLBconst
        Op386ADDLload
        Op386SUBLload
+       Op386MULLload
        Op386ANDLload
        Op386ORLload
        Op386XORLload
@@ -2752,6 +2755,42 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:           "DIVSSload",
+               auxType:        auxSymOff,
+               argLen:         3,
+               resultInArg0:   true,
+               faultOnNilArg1: true,
+               symEffect:      SymRead,
+               asm:            x86.ADIVSS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
+                               {1, 65791}, // AX CX DX BX SP BP SI DI SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
+                       },
+               },
+       },
+       {
+               name:           "DIVSDload",
+               auxType:        auxSymOff,
+               argLen:         3,
+               resultInArg0:   true,
+               faultOnNilArg1: true,
+               symEffect:      SymRead,
+               asm:            x86.ADIVSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
+                               {1, 65791}, // AX CX DX BX SP BP SI DI SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
+                       },
+               },
+       },
        {
                name:         "ADDL",
                argLen:       2,
@@ -3821,6 +3860,25 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:           "MULLload",
+               auxType:        auxSymOff,
+               argLen:         3,
+               resultInArg0:   true,
+               clobberFlags:   true,
+               faultOnNilArg1: true,
+               symEffect:      SymRead,
+               asm:            x86.AIMULL,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 239},   // AX CX DX BX BP SI DI
+                               {1, 65791}, // AX CX DX BX SP BP SI DI SB
+                       },
+                       outputs: []outputInfo{
+                               {0, 239}, // AX CX DX BX BP SI DI
+                       },
+               },
+       },
        {
                name:           "ANDLload",
                auxType:        auxSymOff,
index db2c62089d6de6b1505e71280f1c10c09755306f..039538ea7dce5bea52d6e687114be3c4e8de44d4 100644 (file)
@@ -61,6 +61,14 @@ func rewriteValue386(v *Value) bool {
                return rewriteValue386_Op386CMPWconst_0(v)
        case Op386CMPWload:
                return rewriteValue386_Op386CMPWload_0(v)
+       case Op386DIVSD:
+               return rewriteValue386_Op386DIVSD_0(v)
+       case Op386DIVSDload:
+               return rewriteValue386_Op386DIVSDload_0(v)
+       case Op386DIVSS:
+               return rewriteValue386_Op386DIVSS_0(v)
+       case Op386DIVSSload:
+               return rewriteValue386_Op386DIVSSload_0(v)
        case Op386LEAL:
                return rewriteValue386_Op386LEAL_0(v)
        case Op386LEAL1:
@@ -163,6 +171,8 @@ func rewriteValue386(v *Value) bool {
                return rewriteValue386_Op386MULL_0(v)
        case Op386MULLconst:
                return rewriteValue386_Op386MULLconst_0(v) || rewriteValue386_Op386MULLconst_10(v) || rewriteValue386_Op386MULLconst_20(v) || rewriteValue386_Op386MULLconst_30(v)
+       case Op386MULLload:
+               return rewriteValue386_Op386MULLload_0(v)
        case Op386MULSD:
                return rewriteValue386_Op386MULSD_0(v)
        case Op386MULSDload:
@@ -3098,6 +3108,192 @@ func rewriteValue386_Op386CMPWload_0(v *Value) bool {
        }
        return false
 }
+func rewriteValue386_Op386DIVSD_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
+       // match: (DIVSD x l:(MOVSDload [off] {sym} ptr mem))
+       // cond: canMergeLoad(v, l, x) && !config.use387 && clobber(l)
+       // result: (DIVSDload x [off] {sym} ptr mem)
+       for {
+               _ = v.Args[1]
+               x := v.Args[0]
+               l := v.Args[1]
+               if l.Op != Op386MOVSDload {
+                       break
+               }
+               off := l.AuxInt
+               sym := l.Aux
+               _ = l.Args[1]
+               ptr := l.Args[0]
+               mem := l.Args[1]
+               if !(canMergeLoad(v, l, x) && !config.use387 && clobber(l)) {
+                       break
+               }
+               v.reset(Op386DIVSDload)
+               v.AuxInt = off
+               v.Aux = sym
+               v.AddArg(x)
+               v.AddArg(ptr)
+               v.AddArg(mem)
+               return true
+       }
+       return false
+}
+func rewriteValue386_Op386DIVSDload_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
+       // match: (DIVSDload [off1] {sym} val (ADDLconst [off2] base) mem)
+       // cond: is32Bit(off1+off2)
+       // result: (DIVSDload [off1+off2] {sym} val base mem)
+       for {
+               off1 := v.AuxInt
+               sym := v.Aux
+               _ = v.Args[2]
+               val := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != Op386ADDLconst {
+                       break
+               }
+               off2 := v_1.AuxInt
+               base := v_1.Args[0]
+               mem := v.Args[2]
+               if !(is32Bit(off1 + off2)) {
+                       break
+               }
+               v.reset(Op386DIVSDload)
+               v.AuxInt = off1 + off2
+               v.Aux = sym
+               v.AddArg(val)
+               v.AddArg(base)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (DIVSDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+       // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)
+       // result: (DIVSDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+       for {
+               off1 := v.AuxInt
+               sym1 := v.Aux
+               _ = v.Args[2]
+               val := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != Op386LEAL {
+                       break
+               }
+               off2 := v_1.AuxInt
+               sym2 := v_1.Aux
+               base := v_1.Args[0]
+               mem := v.Args[2]
+               if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)) {
+                       break
+               }
+               v.reset(Op386DIVSDload)
+               v.AuxInt = off1 + off2
+               v.Aux = mergeSym(sym1, sym2)
+               v.AddArg(val)
+               v.AddArg(base)
+               v.AddArg(mem)
+               return true
+       }
+       return false
+}
+func rewriteValue386_Op386DIVSS_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
+       // match: (DIVSS x l:(MOVSSload [off] {sym} ptr mem))
+       // cond: canMergeLoad(v, l, x) && !config.use387 && clobber(l)
+       // result: (DIVSSload x [off] {sym} ptr mem)
+       for {
+               _ = v.Args[1]
+               x := v.Args[0]
+               l := v.Args[1]
+               if l.Op != Op386MOVSSload {
+                       break
+               }
+               off := l.AuxInt
+               sym := l.Aux
+               _ = l.Args[1]
+               ptr := l.Args[0]
+               mem := l.Args[1]
+               if !(canMergeLoad(v, l, x) && !config.use387 && clobber(l)) {
+                       break
+               }
+               v.reset(Op386DIVSSload)
+               v.AuxInt = off
+               v.Aux = sym
+               v.AddArg(x)
+               v.AddArg(ptr)
+               v.AddArg(mem)
+               return true
+       }
+       return false
+}
+func rewriteValue386_Op386DIVSSload_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
+       // match: (DIVSSload [off1] {sym} val (ADDLconst [off2] base) mem)
+       // cond: is32Bit(off1+off2)
+       // result: (DIVSSload [off1+off2] {sym} val base mem)
+       for {
+               off1 := v.AuxInt
+               sym := v.Aux
+               _ = v.Args[2]
+               val := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != Op386ADDLconst {
+                       break
+               }
+               off2 := v_1.AuxInt
+               base := v_1.Args[0]
+               mem := v.Args[2]
+               if !(is32Bit(off1 + off2)) {
+                       break
+               }
+               v.reset(Op386DIVSSload)
+               v.AuxInt = off1 + off2
+               v.Aux = sym
+               v.AddArg(val)
+               v.AddArg(base)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (DIVSSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+       // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)
+       // result: (DIVSSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+       for {
+               off1 := v.AuxInt
+               sym1 := v.Aux
+               _ = v.Args[2]
+               val := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != Op386LEAL {
+                       break
+               }
+               off2 := v_1.AuxInt
+               sym2 := v_1.Aux
+               base := v_1.Args[0]
+               mem := v.Args[2]
+               if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)) {
+                       break
+               }
+               v.reset(Op386DIVSSload)
+               v.AuxInt = off1 + off2
+               v.Aux = mergeSym(sym1, sym2)
+               v.AddArg(val)
+               v.AddArg(base)
+               v.AddArg(mem)
+               return true
+       }
+       return false
+}
 func rewriteValue386_Op386LEAL_0(v *Value) bool {
        // match: (LEAL [c] {s} (ADDLconst [d] x))
        // cond: is32Bit(c+d)
@@ -9825,6 +10021,58 @@ func rewriteValue386_Op386MULL_0(v *Value) bool {
                v.AddArg(x)
                return true
        }
+       // match: (MULL x l:(MOVLload [off] {sym} ptr mem))
+       // cond: canMergeLoad(v, l, x) && clobber(l)
+       // result: (MULLload x [off] {sym} ptr mem)
+       for {
+               _ = v.Args[1]
+               x := v.Args[0]
+               l := v.Args[1]
+               if l.Op != Op386MOVLload {
+                       break
+               }
+               off := l.AuxInt
+               sym := l.Aux
+               _ = l.Args[1]
+               ptr := l.Args[0]
+               mem := l.Args[1]
+               if !(canMergeLoad(v, l, x) && clobber(l)) {
+                       break
+               }
+               v.reset(Op386MULLload)
+               v.AuxInt = off
+               v.Aux = sym
+               v.AddArg(x)
+               v.AddArg(ptr)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (MULL l:(MOVLload [off] {sym} ptr mem) x)
+       // cond: canMergeLoad(v, l, x) && clobber(l)
+       // result: (MULLload x [off] {sym} ptr mem)
+       for {
+               _ = v.Args[1]
+               l := v.Args[0]
+               if l.Op != Op386MOVLload {
+                       break
+               }
+               off := l.AuxInt
+               sym := l.Aux
+               _ = l.Args[1]
+               ptr := l.Args[0]
+               mem := l.Args[1]
+               x := v.Args[1]
+               if !(canMergeLoad(v, l, x) && clobber(l)) {
+                       break
+               }
+               v.reset(Op386MULLload)
+               v.AuxInt = off
+               v.Aux = sym
+               v.AddArg(x)
+               v.AddArg(ptr)
+               v.AddArg(mem)
+               return true
+       }
        return false
 }
 func rewriteValue386_Op386MULLconst_0(v *Value) bool {
@@ -10332,6 +10580,66 @@ func rewriteValue386_Op386MULLconst_30(v *Value) bool {
        }
        return false
 }
+func rewriteValue386_Op386MULLload_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       config := b.Func.Config
+       _ = config
+       // match: (MULLload [off1] {sym} val (ADDLconst [off2] base) mem)
+       // cond: is32Bit(off1+off2)
+       // result: (MULLload [off1+off2] {sym} val base mem)
+       for {
+               off1 := v.AuxInt
+               sym := v.Aux
+               _ = v.Args[2]
+               val := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != Op386ADDLconst {
+                       break
+               }
+               off2 := v_1.AuxInt
+               base := v_1.Args[0]
+               mem := v.Args[2]
+               if !(is32Bit(off1 + off2)) {
+                       break
+               }
+               v.reset(Op386MULLload)
+               v.AuxInt = off1 + off2
+               v.Aux = sym
+               v.AddArg(val)
+               v.AddArg(base)
+               v.AddArg(mem)
+               return true
+       }
+       // match: (MULLload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
+       // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)
+       // result: (MULLload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+       for {
+               off1 := v.AuxInt
+               sym1 := v.Aux
+               _ = v.Args[2]
+               val := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != Op386LEAL {
+                       break
+               }
+               off2 := v_1.AuxInt
+               sym2 := v_1.Aux
+               base := v_1.Args[0]
+               mem := v.Args[2]
+               if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)) {
+                       break
+               }
+               v.reset(Op386MULLload)
+               v.AuxInt = off1 + off2
+               v.Aux = mergeSym(sym1, sym2)
+               v.AddArg(val)
+               v.AddArg(base)
+               v.AddArg(mem)
+               return true
+       }
+       return false
+}
 func rewriteValue386_Op386MULSD_0(v *Value) bool {
        b := v.Block
        _ = b
index d75a55c5659a228583a45dc8c33f509b49d5a2ab..7cdff863b2b99abdb56b7ce07765678ca685949d 100644 (file)
@@ -525,8 +525,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                gc.AddAux(&p.From, v)
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg()
-       case ssa.Op386ADDLload, ssa.Op386SUBLload, ssa.Op386ANDLload, ssa.Op386ORLload, ssa.Op386XORLload,
-               ssa.Op386ADDSDload, ssa.Op386ADDSSload, ssa.Op386SUBSDload, ssa.Op386SUBSSload, ssa.Op386MULSDload, ssa.Op386MULSSload:
+       case ssa.Op386ADDLload, ssa.Op386SUBLload, ssa.Op386MULLload,
+               ssa.Op386ANDLload, ssa.Op386ORLload, ssa.Op386XORLload,
+               ssa.Op386ADDSDload, ssa.Op386ADDSSload, ssa.Op386SUBSDload, ssa.Op386SUBSSload,
+               ssa.Op386MULSDload, ssa.Op386MULSSload, ssa.Op386DIVSSload, ssa.Op386DIVSDload:
                p := s.Prog(v.Op.Asm())
                p.From.Type = obj.TYPE_MEM
                p.From.Reg = v.Args[1].Reg()
index a7f2906db9b1fb405fd4f2efd449766aaa26e017..3c063d873605a827585e896fe75d89b3c7b0baba 100644 (file)
@@ -49,6 +49,13 @@ func Mul_96(n int) int {
        return n * 96
 }
 
+func MulMemSrc(a []uint32, b []float32) {
+       // 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+`
+       a[0] *= a[1]
+       // 386/sse2:`MULSS\s4\([A-Z]+\),\sX[0-9]+`
+       b[0] *= b[1]
+}
+
 // Multiplications merging tests
 
 func MergeMuls1(n int) int {
@@ -85,6 +92,11 @@ func MergeMuls5(a, n int) int {
 //    Division    //
 // -------------- //
 
+func DivMemSrc(a []float64) {
+       // 386/sse2:`DIVSD\s8\([A-Z]+\),\sX[0-9]+`
+       a[0] /= a[1]
+}
+
 func Pow2Divs(n1 uint, n2 int) (uint, int) {
        // 386:"SHRL\t[$]5",-"DIVL"
        // amd64:"SHRQ\t[$]5",-"DIVQ"