From 90f2fa003738651b02eb0dcb957a6135772ff289 Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Sun, 24 Jun 2018 07:04:21 +0000 Subject: [PATCH] cmd/compile: optimize 386 code with MULLload/DIVSSload/DIVSDload MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit IMULL/DIVSS/DIVSD can all take the source operand from memory directly, and this CL implements that optimization. 1. The total size of pkg/linux_386 decreases by about 84KB (excluding cmd/compile). 2. The go1 benchmark shows little regression in total (excluding noise). name old time/op new time/op delta BinaryTree17-4 3.29s ± 2% 3.27s ± 4% ~ (p=0.192 n=30+30) Fannkuch11-4 3.49s ± 2% 3.54s ± 1% +1.48% (p=0.000 n=30+30) FmtFprintfEmpty-4 45.9ns ± 3% 46.3ns ± 4% +0.89% (p=0.037 n=30+30) FmtFprintfString-4 78.8ns ± 3% 78.7ns ± 4% ~ (p=0.209 n=30+27) FmtFprintfInt-4 91.0ns ± 2% 90.3ns ± 2% -0.82% (p=0.031 n=30+27) FmtFprintfIntInt-4 142ns ± 4% 143ns ± 4% ~ (p=0.136 n=30+30) FmtFprintfPrefixedInt-4 181ns ± 3% 183ns ± 4% +1.40% (p=0.005 n=30+30) FmtFprintfFloat-4 404ns ± 4% 408ns ± 3% ~ (p=0.397 n=30+30) FmtManyArgs-4 601ns ± 3% 609ns ± 5% ~ (p=0.059 n=30+30) GobDecode-4 7.21ms ± 5% 7.24ms ± 5% ~ (p=0.612 n=30+30) GobEncode-4 6.91ms ± 6% 6.91ms ± 6% ~ (p=0.797 n=30+30) Gzip-4 398ms ± 6% 399ms ± 4% ~ (p=0.173 n=30+30) Gunzip-4 41.7ms ± 3% 41.8ms ± 3% ~ (p=0.423 n=30+30) HTTPClientServer-4 62.3µs ± 2% 62.7µs ± 3% ~ (p=0.085 n=29+30) JSONEncode-4 21.0ms ± 4% 20.7ms ± 5% -1.39% (p=0.014 n=30+30) JSONDecode-4 66.3ms ± 3% 67.4ms ± 1% +1.71% (p=0.003 n=30+24) Mandelbrot200-4 5.15ms ± 3% 5.16ms ± 3% ~ (p=0.697 n=30+30) GoParse-4 3.24ms ± 3% 3.27ms ± 4% +0.91% (p=0.032 n=30+30) RegexpMatchEasy0_32-4 101ns ± 5% 99ns ± 4% -1.82% (p=0.008 n=29+30) RegexpMatchEasy0_1K-4 848ns ± 4% 841ns ± 2% -0.77% (p=0.043 n=30+30) RegexpMatchEasy1_32-4 106ns ± 6% 106ns ± 3% ~ (p=0.939 n=29+30) RegexpMatchEasy1_1K-4 1.02µs ± 3% 1.03µs ± 4% ~ (p=0.297 n=28+30) RegexpMatchMedium_32-4 129ns ± 4% 127ns ± 4% ~ (p=0.073 
n=30+30) RegexpMatchMedium_1K-4 43.9µs ± 3% 43.8µs ± 3% ~ (p=0.186 n=30+30) RegexpMatchHard_32-4 2.24µs ± 4% 2.22µs ± 4% ~ (p=0.332 n=30+29) RegexpMatchHard_1K-4 68.0µs ± 4% 67.5µs ± 3% ~ (p=0.290 n=30+30) Revcomp-4 1.85s ± 3% 1.85s ± 3% ~ (p=0.358 n=30+30) Template-4 69.6ms ± 3% 70.0ms ± 4% ~ (p=0.273 n=30+30) TimeParse-4 445ns ± 3% 441ns ± 3% ~ (p=0.494 n=30+30) TimeFormat-4 412ns ± 3% 412ns ± 6% ~ (p=0.841 n=30+30) [Geo mean] 66.7µs 66.8µs +0.13% name old speed new speed delta GobDecode-4 107MB/s ± 5% 106MB/s ± 5% ~ (p=0.615 n=30+30) GobEncode-4 111MB/s ± 6% 111MB/s ± 6% ~ (p=0.790 n=30+30) Gzip-4 48.8MB/s ± 6% 48.7MB/s ± 4% ~ (p=0.167 n=30+30) Gunzip-4 465MB/s ± 3% 465MB/s ± 3% ~ (p=0.420 n=30+30) JSONEncode-4 92.4MB/s ± 4% 93.7MB/s ± 5% +1.42% (p=0.015 n=30+30) JSONDecode-4 29.3MB/s ± 3% 28.8MB/s ± 1% -1.72% (p=0.003 n=30+24) GoParse-4 17.9MB/s ± 3% 17.7MB/s ± 4% -0.89% (p=0.037 n=30+30) RegexpMatchEasy0_32-4 317MB/s ± 8% 324MB/s ± 4% +2.14% (p=0.006 n=30+30) RegexpMatchEasy0_1K-4 1.21GB/s ± 4% 1.22GB/s ± 2% +0.77% (p=0.036 n=30+30) RegexpMatchEasy1_32-4 298MB/s ± 7% 299MB/s ± 4% ~ (p=0.511 n=30+30) RegexpMatchEasy1_1K-4 1.00GB/s ± 3% 1.00GB/s ± 4% ~ (p=0.304 n=28+30) RegexpMatchMedium_32-4 7.75MB/s ± 4% 7.82MB/s ± 4% ~ (p=0.089 n=30+30) RegexpMatchMedium_1K-4 23.3MB/s ± 3% 23.4MB/s ± 3% ~ (p=0.181 n=30+30) RegexpMatchHard_32-4 14.3MB/s ± 4% 14.4MB/s ± 4% ~ (p=0.320 n=30+29) RegexpMatchHard_1K-4 15.1MB/s ± 4% 15.2MB/s ± 3% ~ (p=0.273 n=30+30) Revcomp-4 137MB/s ± 3% 137MB/s ± 3% ~ (p=0.352 n=30+30) Template-4 27.9MB/s ± 3% 27.7MB/s ± 4% ~ (p=0.277 n=30+30) [Geo mean] 79.9MB/s 80.1MB/s +0.15% Change-Id: I97333cd8ddabb3c7c88ca5aa9e14a005b74d306d Reviewed-on: https://go-review.googlesource.com/120695 Run-TryBot: Ben Shi TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- src/cmd/compile/internal/ssa/gen/386.rules | 30 +- src/cmd/compile/internal/ssa/gen/386Ops.go | 13 +- src/cmd/compile/internal/ssa/opGen.go | 58 ++++ 
src/cmd/compile/internal/ssa/rewrite386.go | 308 +++++++++++++++++++++ src/cmd/compile/internal/x86/ssa.go | 6 +- test/codegen/arithmetic.go | 12 + 6 files changed, 405 insertions(+), 22 deletions(-) diff --git a/src/cmd/compile/internal/ssa/gen/386.rules b/src/cmd/compile/internal/ssa/gen/386.rules index 94f24a81ef..127829473b 100644 --- a/src/cmd/compile/internal/ssa/gen/386.rules +++ b/src/cmd/compile/internal/ssa/gen/386.rules @@ -636,12 +636,12 @@ (MOVSSstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSSstore [off1+off2] {sym} ptr val mem) (MOVSDstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSDstore [off1+off2] {sym} ptr val mem) -((ADD|SUB|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) -> - ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {sym} val base mem) -((ADD|SUB|MUL)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) -> - ((ADD|SUB|MUL)SSload [off1+off2] {sym} val base mem) -((ADD|SUB|MUL)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) -> - ((ADD|SUB|MUL)SDload [off1+off2] {sym} val base mem) +((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) -> + ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {sym} val base mem) +((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) -> + ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem) +((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) -> + ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem) ((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDLconst [off2] base) val mem) && is32Bit(off1+off2) -> ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem) @@ -757,15 +757,15 @@ (MOVSDstore [off1] {sym1} (LEAL8 [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVSDstoreidx8 [off1+off2] {mergeSym(sym1,sym2)} ptr 
idx val mem) -((ADD|SUB|AND|OR|XOR)Lload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) +((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) -> - ((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem) -((ADD|SUB|MUL)SSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) + ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) -> - ((ADD|SUB|MUL)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem) -((ADD|SUB|MUL)SDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) + ((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem) +((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) -> - ((ADD|SUB|MUL)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem) + ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem) ((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) -> ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem) @@ -846,9 +846,9 @@ (MOVSDstoreidx8 [c] {sym} ptr (ADDLconst [d] idx) val mem) -> (MOVSDstoreidx8 [int64(int32(c+8*d))] {sym} ptr idx val mem) // Merge load/store to op -((ADD|AND|OR|XOR|SUB)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|AND|OR|XOR|SUB)Lload x [off] {sym} ptr mem) -((ADD|SUB|MUL)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL)SDload x [off] {sym} ptr mem) -((ADD|SUB|MUL)SS x l:(MOVSSload [off] {sym} 
ptr mem)) && canMergeLoad(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL)SSload x [off] {sym} ptr mem) +((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem) +((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem) +((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem) (MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) -> ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) (MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y) && clobber(l) -> ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) diff --git a/src/cmd/compile/internal/ssa/gen/386Ops.go b/src/cmd/compile/internal/ssa/gen/386Ops.go index 7a274269d2..40f4a2b15e 100644 --- a/src/cmd/compile/internal/ssa/gen/386Ops.go +++ b/src/cmd/compile/internal/ssa/gen/386Ops.go @@ -183,6 +183,8 @@ func init() { {name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem {name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem {name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 
arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem // binary ops {name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true}, // arg0 + arg1 @@ -279,11 +281,12 @@ func init() { {name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15 {name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-7 - {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem - {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem - {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+auxint+aux, arg2 = mem - {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+auxint+aux, arg2 = mem - {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // 
arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "MULLload", argLength: 3, reg: gp21load, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+auxint+aux, arg2 = mem + {name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem // unary ops {name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0 diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 3554623840..8ac47cb2d0 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -262,6 +262,8 @@ const ( Op386SUBSDload Op386MULSSload Op386MULSDload + Op386DIVSSload + Op386DIVSDload Op386ADDL Op386ADDLconst Op386ADDLcarry @@ -333,6 +335,7 @@ const ( Op386ROLBconst Op386ADDLload Op386SUBLload + Op386MULLload Op386ANDLload Op386ORLload Op386XORLload @@ -2752,6 +2755,42 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "DIVSSload", + auxType: auxSymOff, + argLen: 3, + resultInArg0: true, + faultOnNilArg1: true, + symEffect: SymRead, + asm: x86.ADIVSS, + reg: regInfo{ + inputs: 
[]inputInfo{ + {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 + {1, 65791}, // AX CX DX BX SP BP SI DI SB + }, + outputs: []outputInfo{ + {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 + }, + }, + }, + { + name: "DIVSDload", + auxType: auxSymOff, + argLen: 3, + resultInArg0: true, + faultOnNilArg1: true, + symEffect: SymRead, + asm: x86.ADIVSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 + {1, 65791}, // AX CX DX BX SP BP SI DI SB + }, + outputs: []outputInfo{ + {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 + }, + }, + }, { name: "ADDL", argLen: 2, @@ -3821,6 +3860,25 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "MULLload", + auxType: auxSymOff, + argLen: 3, + resultInArg0: true, + clobberFlags: true, + faultOnNilArg1: true, + symEffect: SymRead, + asm: x86.AIMULL, + reg: regInfo{ + inputs: []inputInfo{ + {0, 239}, // AX CX DX BX BP SI DI + {1, 65791}, // AX CX DX BX SP BP SI DI SB + }, + outputs: []outputInfo{ + {0, 239}, // AX CX DX BX BP SI DI + }, + }, + }, { name: "ANDLload", auxType: auxSymOff, diff --git a/src/cmd/compile/internal/ssa/rewrite386.go b/src/cmd/compile/internal/ssa/rewrite386.go index db2c62089d..039538ea7d 100644 --- a/src/cmd/compile/internal/ssa/rewrite386.go +++ b/src/cmd/compile/internal/ssa/rewrite386.go @@ -61,6 +61,14 @@ func rewriteValue386(v *Value) bool { return rewriteValue386_Op386CMPWconst_0(v) case Op386CMPWload: return rewriteValue386_Op386CMPWload_0(v) + case Op386DIVSD: + return rewriteValue386_Op386DIVSD_0(v) + case Op386DIVSDload: + return rewriteValue386_Op386DIVSDload_0(v) + case Op386DIVSS: + return rewriteValue386_Op386DIVSS_0(v) + case Op386DIVSSload: + return rewriteValue386_Op386DIVSSload_0(v) case Op386LEAL: return rewriteValue386_Op386LEAL_0(v) case Op386LEAL1: @@ -163,6 +171,8 @@ func rewriteValue386(v *Value) bool { return rewriteValue386_Op386MULL_0(v) case Op386MULLconst: return rewriteValue386_Op386MULLconst_0(v) || rewriteValue386_Op386MULLconst_10(v) || 
rewriteValue386_Op386MULLconst_20(v) || rewriteValue386_Op386MULLconst_30(v) + case Op386MULLload: + return rewriteValue386_Op386MULLload_0(v) case Op386MULSD: return rewriteValue386_Op386MULSD_0(v) case Op386MULSDload: @@ -3098,6 +3108,192 @@ func rewriteValue386_Op386CMPWload_0(v *Value) bool { } return false } +func rewriteValue386_Op386DIVSD_0(v *Value) bool { + b := v.Block + _ = b + config := b.Func.Config + _ = config + // match: (DIVSD x l:(MOVSDload [off] {sym} ptr mem)) + // cond: canMergeLoad(v, l, x) && !config.use387 && clobber(l) + // result: (DIVSDload x [off] {sym} ptr mem) + for { + _ = v.Args[1] + x := v.Args[0] + l := v.Args[1] + if l.Op != Op386MOVSDload { + break + } + off := l.AuxInt + sym := l.Aux + _ = l.Args[1] + ptr := l.Args[0] + mem := l.Args[1] + if !(canMergeLoad(v, l, x) && !config.use387 && clobber(l)) { + break + } + v.reset(Op386DIVSDload) + v.AuxInt = off + v.Aux = sym + v.AddArg(x) + v.AddArg(ptr) + v.AddArg(mem) + return true + } + return false +} +func rewriteValue386_Op386DIVSDload_0(v *Value) bool { + b := v.Block + _ = b + config := b.Func.Config + _ = config + // match: (DIVSDload [off1] {sym} val (ADDLconst [off2] base) mem) + // cond: is32Bit(off1+off2) + // result: (DIVSDload [off1+off2] {sym} val base mem) + for { + off1 := v.AuxInt + sym := v.Aux + _ = v.Args[2] + val := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != Op386ADDLconst { + break + } + off2 := v_1.AuxInt + base := v_1.Args[0] + mem := v.Args[2] + if !(is32Bit(off1 + off2)) { + break + } + v.reset(Op386DIVSDload) + v.AuxInt = off1 + off2 + v.Aux = sym + v.AddArg(val) + v.AddArg(base) + v.AddArg(mem) + return true + } + // match: (DIVSDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) + // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) + // result: (DIVSDload [off1+off2] {mergeSym(sym1,sym2)} val base mem) + for { + off1 := v.AuxInt + sym1 := v.Aux + _ = v.Args[2] + val := v.Args[0] + v_1 := v.Args[1] + 
if v_1.Op != Op386LEAL { + break + } + off2 := v_1.AuxInt + sym2 := v_1.Aux + base := v_1.Args[0] + mem := v.Args[2] + if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)) { + break + } + v.reset(Op386DIVSDload) + v.AuxInt = off1 + off2 + v.Aux = mergeSym(sym1, sym2) + v.AddArg(val) + v.AddArg(base) + v.AddArg(mem) + return true + } + return false +} +func rewriteValue386_Op386DIVSS_0(v *Value) bool { + b := v.Block + _ = b + config := b.Func.Config + _ = config + // match: (DIVSS x l:(MOVSSload [off] {sym} ptr mem)) + // cond: canMergeLoad(v, l, x) && !config.use387 && clobber(l) + // result: (DIVSSload x [off] {sym} ptr mem) + for { + _ = v.Args[1] + x := v.Args[0] + l := v.Args[1] + if l.Op != Op386MOVSSload { + break + } + off := l.AuxInt + sym := l.Aux + _ = l.Args[1] + ptr := l.Args[0] + mem := l.Args[1] + if !(canMergeLoad(v, l, x) && !config.use387 && clobber(l)) { + break + } + v.reset(Op386DIVSSload) + v.AuxInt = off + v.Aux = sym + v.AddArg(x) + v.AddArg(ptr) + v.AddArg(mem) + return true + } + return false +} +func rewriteValue386_Op386DIVSSload_0(v *Value) bool { + b := v.Block + _ = b + config := b.Func.Config + _ = config + // match: (DIVSSload [off1] {sym} val (ADDLconst [off2] base) mem) + // cond: is32Bit(off1+off2) + // result: (DIVSSload [off1+off2] {sym} val base mem) + for { + off1 := v.AuxInt + sym := v.Aux + _ = v.Args[2] + val := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != Op386ADDLconst { + break + } + off2 := v_1.AuxInt + base := v_1.Args[0] + mem := v.Args[2] + if !(is32Bit(off1 + off2)) { + break + } + v.reset(Op386DIVSSload) + v.AuxInt = off1 + off2 + v.Aux = sym + v.AddArg(val) + v.AddArg(base) + v.AddArg(mem) + return true + } + // match: (DIVSSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) + // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) + // result: (DIVSSload [off1+off2] {mergeSym(sym1,sym2)} val base mem) + for { + off1 
:= v.AuxInt + sym1 := v.Aux + _ = v.Args[2] + val := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != Op386LEAL { + break + } + off2 := v_1.AuxInt + sym2 := v_1.Aux + base := v_1.Args[0] + mem := v.Args[2] + if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)) { + break + } + v.reset(Op386DIVSSload) + v.AuxInt = off1 + off2 + v.Aux = mergeSym(sym1, sym2) + v.AddArg(val) + v.AddArg(base) + v.AddArg(mem) + return true + } + return false +} func rewriteValue386_Op386LEAL_0(v *Value) bool { // match: (LEAL [c] {s} (ADDLconst [d] x)) // cond: is32Bit(c+d) @@ -9825,6 +10021,58 @@ func rewriteValue386_Op386MULL_0(v *Value) bool { v.AddArg(x) return true } + // match: (MULL x l:(MOVLload [off] {sym} ptr mem)) + // cond: canMergeLoad(v, l, x) && clobber(l) + // result: (MULLload x [off] {sym} ptr mem) + for { + _ = v.Args[1] + x := v.Args[0] + l := v.Args[1] + if l.Op != Op386MOVLload { + break + } + off := l.AuxInt + sym := l.Aux + _ = l.Args[1] + ptr := l.Args[0] + mem := l.Args[1] + if !(canMergeLoad(v, l, x) && clobber(l)) { + break + } + v.reset(Op386MULLload) + v.AuxInt = off + v.Aux = sym + v.AddArg(x) + v.AddArg(ptr) + v.AddArg(mem) + return true + } + // match: (MULL l:(MOVLload [off] {sym} ptr mem) x) + // cond: canMergeLoad(v, l, x) && clobber(l) + // result: (MULLload x [off] {sym} ptr mem) + for { + _ = v.Args[1] + l := v.Args[0] + if l.Op != Op386MOVLload { + break + } + off := l.AuxInt + sym := l.Aux + _ = l.Args[1] + ptr := l.Args[0] + mem := l.Args[1] + x := v.Args[1] + if !(canMergeLoad(v, l, x) && clobber(l)) { + break + } + v.reset(Op386MULLload) + v.AuxInt = off + v.Aux = sym + v.AddArg(x) + v.AddArg(ptr) + v.AddArg(mem) + return true + } return false } func rewriteValue386_Op386MULLconst_0(v *Value) bool { @@ -10332,6 +10580,66 @@ func rewriteValue386_Op386MULLconst_30(v *Value) bool { } return false } +func rewriteValue386_Op386MULLload_0(v *Value) bool { + b := v.Block + _ = b + config := b.Func.Config + _ = 
config + // match: (MULLload [off1] {sym} val (ADDLconst [off2] base) mem) + // cond: is32Bit(off1+off2) + // result: (MULLload [off1+off2] {sym} val base mem) + for { + off1 := v.AuxInt + sym := v.Aux + _ = v.Args[2] + val := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != Op386ADDLconst { + break + } + off2 := v_1.AuxInt + base := v_1.Args[0] + mem := v.Args[2] + if !(is32Bit(off1 + off2)) { + break + } + v.reset(Op386MULLload) + v.AuxInt = off1 + off2 + v.Aux = sym + v.AddArg(val) + v.AddArg(base) + v.AddArg(mem) + return true + } + // match: (MULLload [off1] {sym1} val (LEAL [off2] {sym2} base) mem) + // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) + // result: (MULLload [off1+off2] {mergeSym(sym1,sym2)} val base mem) + for { + off1 := v.AuxInt + sym1 := v.Aux + _ = v.Args[2] + val := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != Op386LEAL { + break + } + off2 := v_1.AuxInt + sym2 := v_1.Aux + base := v_1.Args[0] + mem := v.Args[2] + if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared)) { + break + } + v.reset(Op386MULLload) + v.AuxInt = off1 + off2 + v.Aux = mergeSym(sym1, sym2) + v.AddArg(val) + v.AddArg(base) + v.AddArg(mem) + return true + } + return false +} func rewriteValue386_Op386MULSD_0(v *Value) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/x86/ssa.go b/src/cmd/compile/internal/x86/ssa.go index d75a55c565..7cdff863b2 100644 --- a/src/cmd/compile/internal/x86/ssa.go +++ b/src/cmd/compile/internal/x86/ssa.go @@ -525,8 +525,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg() - case ssa.Op386ADDLload, ssa.Op386SUBLload, ssa.Op386ANDLload, ssa.Op386ORLload, ssa.Op386XORLload, - ssa.Op386ADDSDload, ssa.Op386ADDSSload, ssa.Op386SUBSDload, ssa.Op386SUBSSload, ssa.Op386MULSDload, ssa.Op386MULSSload: + case ssa.Op386ADDLload, ssa.Op386SUBLload, ssa.Op386MULLload, + 
ssa.Op386ANDLload, ssa.Op386ORLload, ssa.Op386XORLload, + ssa.Op386ADDSDload, ssa.Op386ADDSSload, ssa.Op386SUBSDload, ssa.Op386SUBSSload, + ssa.Op386MULSDload, ssa.Op386MULSSload, ssa.Op386DIVSSload, ssa.Op386DIVSDload: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = v.Args[1].Reg() diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go index a7f2906db9..3c063d8736 100644 --- a/test/codegen/arithmetic.go +++ b/test/codegen/arithmetic.go @@ -49,6 +49,13 @@ func Mul_96(n int) int { return n * 96 } +func MulMemSrc(a []uint32, b []float32) { + // 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+` + a[0] *= a[1] + // 386/sse2:`MULSS\s4\([A-Z]+\),\sX[0-9]+` + b[0] *= b[1] +} + // Multiplications merging tests func MergeMuls1(n int) int { @@ -85,6 +92,11 @@ func MergeMuls5(a, n int) int { // Division // // -------------- // +func DivMemSrc(a []float64) { + // 386/sse2:`DIVSD\s8\([A-Z]+\),\sX[0-9]+` + a[0] /= a[1] +} + func Pow2Divs(n1 uint, n2 int) (uint, int) { // 386:"SHRL\t[$]5",-"DIVL" // amd64:"SHRQ\t[$]5",-"DIVQ" -- 2.48.1