From 4a33af6bb63eaa69a4a2cc0d4f222d37d7531b9c Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Mon, 18 Jul 2016 15:52:59 -0700 Subject: [PATCH] [dev.ssa] cmd/compile: more 386 port changes Fix up zero/move code, including duff calls and rep movs. Handle the new ops generated by dec64.rules. Fix constant shifts. Change-Id: I7d89194b29b04311bfafa0fd93b9f5644af04df9 Reviewed-on: https://go-review.googlesource.com/25033 Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot Reviewed-by: David Chase --- src/cmd/compile/internal/ssa/gen/386.rules | 79 ++- src/cmd/compile/internal/ssa/gen/386Ops.go | 9 +- src/cmd/compile/internal/ssa/gen/AMD64.rules | 4 +- src/cmd/compile/internal/ssa/gen/dec64.rules | 2 +- src/cmd/compile/internal/ssa/opGen.go | 44 +- src/cmd/compile/internal/ssa/rewrite.go | 45 +- src/cmd/compile/internal/ssa/rewrite386.go | 606 ++++++++++++++----- src/cmd/compile/internal/ssa/rewriteAMD64.go | 6 +- src/cmd/compile/internal/x86/ssa.go | 29 +- 9 files changed, 550 insertions(+), 274 deletions(-) diff --git a/src/cmd/compile/internal/ssa/gen/386.rules b/src/cmd/compile/internal/ssa/gen/386.rules index 6569533b77..0587be4367 100644 --- a/src/cmd/compile/internal/ssa/gen/386.rules +++ b/src/cmd/compile/internal/ssa/gen/386.rules @@ -83,8 +83,7 @@ (Not x) -> (XORLconst [1] x) // Lowering pointer arithmetic -(OffPtr [off] ptr) && is32Bit(off) -> (ADDLconst [off] ptr) -(OffPtr [off] ptr) -> (ADDL (MOVLconst [off]) ptr) +(OffPtr [off] ptr) -> (ADDLconst [off] ptr) (Bswap32 x) -> (BSWAPL x) @@ -99,6 +98,9 @@ (ZeroExt8to32 x) -> (MOVBLZX x) (ZeroExt16to32 x) -> (MOVWLZX x) +(Signmask x) -> (SARLconst x [31]) +(Zeromask x) -> (SBBLcarrymask (CMPL (MOVLconst [0]) x)) + // Lowering truncation // Because we ignore high parts of registers, truncates are just copies. 
(Trunc16to8 x) -> x @@ -161,6 +163,26 @@ (Rsh8x16 x y) -> (SARB x (ORL y (NOTL (SBBLcarrymask (CMPWconst y [8]))))) (Rsh8x8 x y) -> (SARB x (ORL y (NOTL (SBBLcarrymask (CMPBconst y [8]))))) +// constant shifts +// generic opt rewrites all constant shifts to shift by Const64 +(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 -> (SHLLconst x [c]) +(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 -> (SARLconst x [c]) +(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 -> (SHRLconst x [c]) +(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 -> (SHLLconst x [c]) +(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 -> (SARWconst x [c]) +(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 -> (SHRWconst x [c]) +(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 -> (SHLLconst x [c]) +(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 -> (SARBconst x [c]) +(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 -> (SHRBconst x [c]) + +// large constant shifts +(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 -> (Const32 [0]) +(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 -> (Const32 [0]) +(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 -> (Const16 [0]) +(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 -> (Const16 [0]) +(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 -> (Const8 [0]) +(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 -> (Const8 [0]) + // Lowering comparisons (Less32 x y) -> (SETL (CMPL x y)) (Less16 x y) -> (SETL (CMPW x y)) @@ -241,7 +263,6 @@ (Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstore dst (MOVWload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstore dst (MOVLload src mem) mem) -(Move [s] dst src mem) && SizeAndAlign(s).Size() == 16 -> (MOVOstore dst (MOVOload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 -> (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem)) @@ -254,21 +275,32 @@ (Move [s] dst src mem) && SizeAndAlign(s).Size() == 7 -> (MOVLstore [3] dst (MOVLload [3] src mem) (MOVLstore dst (MOVLload src mem) mem)) +(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 -> + (MOVLstore [4] dst (MOVLload [4] src mem) + (MOVLstore dst (MOVLload src mem) mem)) + +// Adjust moves to be a multiple of 4 bytes. +(Move [s] dst src mem) + && SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size()%4 != 0 -> + (Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%4] + (ADDLconst dst [SizeAndAlign(s).Size()%4]) + (ADDLconst src [SizeAndAlign(s).Size()%4]) + (MOVLstore dst (MOVLload src mem) mem)) // Medium copying uses a duff device. (Move [s] dst src mem) - && SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0 + && SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice -> - (DUFFCOPY [14*(64-SizeAndAlign(s).Size()/16)] dst src mem) -// 14 and 64 are magic constants. 14 is the number of bytes to encode: -// MOVUPS (SI), X0 -// ADDL $16, SI -// MOVUPS X0, (DI) -// ADDL $16, DI -// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy. + (DUFFCOPY [10*(128-SizeAndAlign(s).Size()/4)] dst src mem) +// 10 and 128 are magic constants. 10 is the number of bytes to encode: +// MOVL (SI), CX +// ADDL $4, SI +// MOVL CX, (DI) +// ADDL $4, DI +// and 128 is the number of such blocks. See src/runtime/duff_386.s:duffcopy. // Large copying uses REP MOVSL. 
-(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0 -> +(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 4*128 || config.noDuffDevice) && SizeAndAlign(s).Size()%4 == 0 -> (REPMOVSL dst src (MOVLconst [SizeAndAlign(s).Size()/4]) mem) // Lowering Zero instructions @@ -309,11 +341,22 @@ (MOVLstoreconst [makeValAndOff(0,4)] destptr (MOVLstoreconst [0] destptr mem)))) +// Medium zeroing uses a duff device. +(Zero [s] destptr mem) + && SizeAndAlign(s).Size() > 16 + && SizeAndAlign(s).Size() <= 4*128 + && SizeAndAlign(s).Size()%4 == 0 + && !config.noDuffDevice -> + (DUFFZERO [1*(128-SizeAndAlign(s).Size()/4)] destptr (MOVLconst [0]) mem) +// 1 and 128 are magic constants. 1 is the number of bytes to encode STOSL. +// 128 is the number of STOSL instructions in duffzero. +// See src/runtime/duff_386.s:duffzero. + // Large zeroing uses REP STOSQ. (Zero [s] destptr mem) - && (SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32)) - && SizeAndAlign(s).Size()%8 == 0 -> - (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/8]) (MOVLconst [0]) mem) + && (SizeAndAlign(s).Size() > 4*128 || (config.noDuffDevice && SizeAndAlign(s).Size() > 16)) + && SizeAndAlign(s).Size()%4 == 0 -> + (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/4]) (MOVLconst [0]) mem) // Lowering constants (Const8 [val]) -> (MOVLconst [val]) @@ -596,14 +639,12 @@ (MOVBload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVBload [off1+off2] {sym} ptr mem) (MOVSSload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSSload [off1+off2] {sym} ptr mem) (MOVSDload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSDload [off1+off2] {sym} ptr mem) -(MOVOload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVOload [off1+off2] {sym} ptr mem) (MOVLstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVLstore [off1+off2] {sym} ptr val mem) (MOVWstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVWstore [off1+off2] {sym} ptr val mem) (MOVBstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVBstore [off1+off2] {sym} ptr val mem) (MOVSSstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSSstore [off1+off2] {sym} ptr val mem) (MOVSDstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSDstore [off1+off2] {sym} ptr val mem) -(MOVOstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVOstore [off1+off2] {sym} ptr val mem) // Fold constants into stores. 
(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(off) -> @@ -633,8 +674,6 @@ (MOVSSload [off1+off2] {mergeSym(sym1,sym2)} base mem) (MOVSDload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVSDload [off1+off2] {mergeSym(sym1,sym2)} base mem) -(MOVOload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> - (MOVOload [off1+off2] {mergeSym(sym1,sym2)} base mem) (MOVBLSXload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVBLSXload [off1+off2] {mergeSym(sym1,sym2)} base mem) @@ -651,8 +690,6 @@ (MOVSSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) (MOVSDstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVSDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) -(MOVOstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> - (MOVOstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) (MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) -> (MOVLstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem) diff --git a/src/cmd/compile/internal/ssa/gen/386Ops.go b/src/cmd/compile/internal/ssa/gen/386Ops.go index 68bcfa9649..49c4cd49e4 100644 --- a/src/cmd/compile/internal/ssa/gen/386Ops.go +++ b/src/cmd/compile/internal/ssa/gen/386Ops.go @@ -330,8 +330,6 @@ func init() { {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem - {name: "MOVOload", argLength: 2, reg: fpload, asm: "MOVUPS", aux: "SymOff", typ: "Int128"}, // load 16 bytes from arg0+auxint+aux. arg1=mem - {name: "MOVOstore", argLength: 3, reg: fpstore, asm: "MOVUPS", aux: "SymOff", typ: "Mem"}, // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem // indexed loads/stores {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, asm: "MOVBLZX", aux: "SymOff"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem @@ -360,7 +358,7 @@ func init() { {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... arg1 ... {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... 4*arg1 ... 
- // arg0 = (duff-adjusted) pointer to start of memory to zero + // arg0 = pointer to start of memory to zero // arg1 = value to store (will always be zero) // arg2 = mem // auxint = offset into duffzero code to start executing @@ -370,11 +368,10 @@ func init() { aux: "Int64", argLength: 3, reg: regInfo{ - inputs: []regMask{buildReg("DI"), buildReg("X0")}, + inputs: []regMask{buildReg("DI"), buildReg("AX")}, clobbers: buildReg("DI FLAGS"), }, }, - {name: "MOVOconst", reg: regInfo{nil, 0, []regMask{fp}}, typ: "Int128", aux: "Int128", rematerializeable: true}, // arg0 = address of memory to zero // arg1 = # of 4-byte words to zero @@ -407,7 +404,7 @@ func init() { argLength: 3, reg: regInfo{ inputs: []regMask{buildReg("DI"), buildReg("SI")}, - clobbers: buildReg("DI SI X0 FLAGS"), // uses X0 as a temporary + clobbers: buildReg("DI SI CX FLAGS"), // uses CX as a temporary }, }, diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index b429b6f627..811e810f15 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -400,8 +400,8 @@ (Zero [SizeAndAlign(s).Size()-8] (ADDQconst [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem)) (Zero [s] destptr mem) && SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice -> - (DUFFZERO [duffStart(SizeAndAlign(s).Size())] - (ADDQconst [duffAdj(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) + (DUFFZERO [duffStartAMD64(SizeAndAlign(s).Size())] + (ADDQconst [duffAdjAMD64(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem) // Large zeroing uses REP STOSQ. diff --git a/src/cmd/compile/internal/ssa/gen/dec64.rules b/src/cmd/compile/internal/ssa/gen/dec64.rules index 47e2933872..8b2fd27669 100644 --- a/src/cmd/compile/internal/ssa/gen/dec64.rules +++ b/src/cmd/compile/internal/ssa/gen/dec64.rules @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// This file contains rules to decompose [u]int32 types on 32-bit +// This file contains rules to decompose [u]int64 types on 32-bit // architectures. These rules work together with the decomposeBuiltIn // pass which handles phis of these types. 
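Editor's note (not part of the patch): dec64.rules decomposes 64-bit integer ops into 32-bit halves and, in doing so, emits the generic Signmask and Zeromask ops that the 386 rules above now lower to SARLconst and SBBLcarrymask. A minimal Go sketch of the intended semantics, using hypothetical helper names:

func signmask32(x int32) int32 {
	// 386 lowering: SARLconst x [31]; the arithmetic shift copies the sign
	// bit into every bit, giving all ones for negative x and zero otherwise.
	return x >> 31
}

func zeromask32(x uint32) uint32 {
	// 386 lowering: CMPL (MOVLconst [0]) x computes 0-x, which sets the carry
	// (borrow) flag exactly when x != 0; SBBLcarrymask turns that carry into
	// an all-ones mask.
	if x != 0 {
		return 0xFFFFFFFF
	}
	return 0
}

dec64.rules uses Signmask, for instance, to produce the high word when sign-extending a 32-bit value to 64 bits, and Zeromask when lowering 64-bit shifts by 32-bit counts.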
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 47cfda86b5..a09e736b79 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -285,8 +285,6 @@ const ( Op386MOVBstore Op386MOVWstore Op386MOVLstore - Op386MOVOload - Op386MOVOstore Op386MOVBloadidx1 Op386MOVWloadidx1 Op386MOVWloadidx2 @@ -306,7 +304,6 @@ const ( Op386MOVLstoreconstidx1 Op386MOVLstoreconstidx4 Op386DUFFZERO - Op386MOVOconst Op386REPSTOSL Op386CALLstatic Op386CALLclosure @@ -3152,32 +3149,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "MOVOload", - auxType: auxSymOff, - argLen: 2, - asm: x86.AMOVUPS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 65791}, // AX CX DX BX SP BP SI DI SB - }, - outputs: []outputInfo{ - {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 - }, - }, - }, - { - name: "MOVOstore", - auxType: auxSymOff, - argLen: 3, - asm: x86.AMOVUPS, - reg: regInfo{ - inputs: []inputInfo{ - {1, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 - {0, 65791}, // AX CX DX BX SP BP SI DI SB - }, - }, - }, { name: "MOVBloadidx1", auxType: auxSymOff, @@ -3418,22 +3389,11 @@ var opcodeTable = [...]opInfo{ reg: regInfo{ inputs: []inputInfo{ {0, 128}, // DI - {1, 256}, // X0 + {1, 1}, // AX }, clobbers: 131200, // DI FLAGS }, }, - { - name: "MOVOconst", - auxType: auxInt128, - argLen: 0, - rematerializeable: true, - reg: regInfo{ - outputs: []outputInfo{ - {0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7 - }, - }, - }, { name: "REPSTOSL", argLen: 4, @@ -3502,7 +3462,7 @@ var opcodeTable = [...]opInfo{ {0, 128}, // DI {1, 64}, // SI }, - clobbers: 131520, // SI DI X0 FLAGS + clobbers: 131266, // CX SI DI FLAGS }, }, { diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 03c38827cc..09798eb1bd 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -254,39 +254,38 @@ func isSamePtr(p1, p2 *Value) bool { return false } -// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD, -// See runtime/mkduff.go. -const ( - dzBlocks = 16 // number of MOV/ADD blocks - dzBlockLen = 4 // number of clears per block - dzBlockSize = 19 // size of instructions in a single block - dzMovSize = 4 // size of single MOV instruction w/ offset - dzAddSize = 4 // size of single ADD instruction - dzClearStep = 16 // number of bytes cleared by each MOV instruction - - dzTailLen = 4 // number of final STOSQ instructions - dzTailSize = 2 // size of single STOSQ instruction - - dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block - dzSize = dzBlocks * dzBlockSize -) - -func duffStart(size int64) int64 { - x, _ := duff(size) +func duffStartAMD64(size int64) int64 { + x, _ := duffAMD64(size) return x } -func duffAdj(size int64) int64 { - _, x := duff(size) +func duffAdjAMD64(size int64) int64 { + _, x := duffAMD64(size) return x } // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes) // required to use the duffzero mechanism for a block of the given size. -func duff(size int64) (int64, int64) { +func duffAMD64(size int64) (int64, int64) { + // DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD, + // See runtime/mkduff.go. 
+ const ( + dzBlocks = 16 // number of MOV/ADD blocks + dzBlockLen = 4 // number of clears per block + dzBlockSize = 19 // size of instructions in a single block + dzMovSize = 4 // size of single MOV instruction w/ offset + dzAddSize = 4 // size of single ADD instruction + dzClearStep = 16 // number of bytes cleared by each MOV instruction + + dzTailLen = 4 // number of final STOSQ instructions + dzTailSize = 2 // size of single STOSQ instruction + + dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block + dzSize = dzBlocks * dzBlockSize + ) + if size < 32 || size > 1024 || size%dzClearStep != 0 { panic("bad duffzero size") } - // TODO: arch-dependent steps := size / dzClearStep blocks := steps / dzBlockLen steps %= dzBlockLen diff --git a/src/cmd/compile/internal/ssa/rewrite386.go b/src/cmd/compile/internal/ssa/rewrite386.go index f3f021493d..5d571c588f 100644 --- a/src/cmd/compile/internal/ssa/rewrite386.go +++ b/src/cmd/compile/internal/ssa/rewrite386.go @@ -240,18 +240,24 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpLsh16x16(v, config) case OpLsh16x32: return rewriteValue386_OpLsh16x32(v, config) + case OpLsh16x64: + return rewriteValue386_OpLsh16x64(v, config) case OpLsh16x8: return rewriteValue386_OpLsh16x8(v, config) case OpLsh32x16: return rewriteValue386_OpLsh32x16(v, config) case OpLsh32x32: return rewriteValue386_OpLsh32x32(v, config) + case OpLsh32x64: + return rewriteValue386_OpLsh32x64(v, config) case OpLsh32x8: return rewriteValue386_OpLsh32x8(v, config) case OpLsh8x16: return rewriteValue386_OpLsh8x16(v, config) case OpLsh8x32: return rewriteValue386_OpLsh8x32(v, config) + case OpLsh8x64: + return rewriteValue386_OpLsh8x64(v, config) case OpLsh8x8: return rewriteValue386_OpLsh8x8(v, config) case Op386MOVBLSX: @@ -290,10 +296,6 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_Op386MOVLstoreidx1(v, config) case Op386MOVLstoreidx4: return rewriteValue386_Op386MOVLstoreidx4(v, config) - case Op386MOVOload: - return rewriteValue386_Op386MOVOload(v, config) - case Op386MOVOstore: - return rewriteValue386_Op386MOVOstore(v, config) case Op386MOVSDload: return rewriteValue386_Op386MOVSDload(v, config) case Op386MOVSDloadidx1: @@ -428,36 +430,48 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpRsh16Ux16(v, config) case OpRsh16Ux32: return rewriteValue386_OpRsh16Ux32(v, config) + case OpRsh16Ux64: + return rewriteValue386_OpRsh16Ux64(v, config) case OpRsh16Ux8: return rewriteValue386_OpRsh16Ux8(v, config) case OpRsh16x16: return rewriteValue386_OpRsh16x16(v, config) case OpRsh16x32: return rewriteValue386_OpRsh16x32(v, config) + case OpRsh16x64: + return rewriteValue386_OpRsh16x64(v, config) case OpRsh16x8: return rewriteValue386_OpRsh16x8(v, config) case OpRsh32Ux16: return rewriteValue386_OpRsh32Ux16(v, config) case OpRsh32Ux32: return rewriteValue386_OpRsh32Ux32(v, config) + case OpRsh32Ux64: + return rewriteValue386_OpRsh32Ux64(v, config) case OpRsh32Ux8: return rewriteValue386_OpRsh32Ux8(v, config) case OpRsh32x16: return rewriteValue386_OpRsh32x16(v, config) case OpRsh32x32: return rewriteValue386_OpRsh32x32(v, config) + case OpRsh32x64: + return rewriteValue386_OpRsh32x64(v, config) case OpRsh32x8: return rewriteValue386_OpRsh32x8(v, config) case OpRsh8Ux16: return rewriteValue386_OpRsh8Ux16(v, config) case OpRsh8Ux32: return rewriteValue386_OpRsh8Ux32(v, config) + case OpRsh8Ux64: + return rewriteValue386_OpRsh8Ux64(v, config) case OpRsh8Ux8: return 
rewriteValue386_OpRsh8Ux8(v, config) case OpRsh8x16: return rewriteValue386_OpRsh8x16(v, config) case OpRsh8x32: return rewriteValue386_OpRsh8x32(v, config) + case OpRsh8x64: + return rewriteValue386_OpRsh8x64(v, config) case OpRsh8x8: return rewriteValue386_OpRsh8x8(v, config) case Op386SARB: @@ -516,6 +530,8 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpSignExt8to16(v, config) case OpSignExt8to32: return rewriteValue386_OpSignExt8to32(v, config) + case OpSignmask: + return rewriteValue386_OpSignmask(v, config) case OpSqrt: return rewriteValue386_OpSqrt(v, config) case OpStaticCall: @@ -562,6 +578,8 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpZeroExt8to16(v, config) case OpZeroExt8to32: return rewriteValue386_OpZeroExt8to32(v, config) + case OpZeromask: + return rewriteValue386_OpZeromask(v, config) } return false } @@ -4062,6 +4080,45 @@ func rewriteValue386_OpLsh16x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpLsh16x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Lsh16x64 x (Const64 [c])) + // cond: uint64(c) < 16 + // result: (SHLLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 16) { + break + } + v.reset(Op386SHLLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Lsh16x64 _ (Const64 [c])) + // cond: uint64(c) >= 16 + // result: (Const16 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 16) { + break + } + v.reset(OpConst16) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpLsh16x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -4134,6 +4191,45 @@ func rewriteValue386_OpLsh32x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpLsh32x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Lsh32x64 x (Const64 [c])) + // cond: uint64(c) < 32 + // result: (SHLLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 32) { + break + } + v.reset(Op386SHLLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Lsh32x64 _ (Const64 [c])) + // cond: uint64(c) >= 32 + // result: (Const32 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 32) { + break + } + v.reset(OpConst32) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpLsh32x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -4206,6 +4302,45 @@ func rewriteValue386_OpLsh8x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpLsh8x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Lsh8x64 x (Const64 [c])) + // cond: uint64(c) < 8 + // result: (SHLLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 8) { + break + } + v.reset(Op386SHLLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Lsh8x64 _ (Const64 [c])) + // cond: uint64(c) >= 8 + // result: (Const8 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 8) { + break + } + v.reset(OpConst8) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpLsh8x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -5997,114 +6132,6 @@ func 
rewriteValue386_Op386MOVLstoreidx4(v *Value, config *Config) bool { } return false } -func rewriteValue386_Op386MOVOload(v *Value, config *Config) bool { - b := v.Block - _ = b - // match: (MOVOload [off1] {sym} (ADDLconst [off2] ptr) mem) - // cond: is32Bit(off1+off2) - // result: (MOVOload [off1+off2] {sym} ptr mem) - for { - off1 := v.AuxInt - sym := v.Aux - v_0 := v.Args[0] - if v_0.Op != Op386ADDLconst { - break - } - off2 := v_0.AuxInt - ptr := v_0.Args[0] - mem := v.Args[1] - if !(is32Bit(off1 + off2)) { - break - } - v.reset(Op386MOVOload) - v.AuxInt = off1 + off2 - v.Aux = sym - v.AddArg(ptr) - v.AddArg(mem) - return true - } - // match: (MOVOload [off1] {sym1} (LEAL [off2] {sym2} base) mem) - // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) - // result: (MOVOload [off1+off2] {mergeSym(sym1,sym2)} base mem) - for { - off1 := v.AuxInt - sym1 := v.Aux - v_0 := v.Args[0] - if v_0.Op != Op386LEAL { - break - } - off2 := v_0.AuxInt - sym2 := v_0.Aux - base := v_0.Args[0] - mem := v.Args[1] - if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2)) { - break - } - v.reset(Op386MOVOload) - v.AuxInt = off1 + off2 - v.Aux = mergeSym(sym1, sym2) - v.AddArg(base) - v.AddArg(mem) - return true - } - return false -} -func rewriteValue386_Op386MOVOstore(v *Value, config *Config) bool { - b := v.Block - _ = b - // match: (MOVOstore [off1] {sym} (ADDLconst [off2] ptr) val mem) - // cond: is32Bit(off1+off2) - // result: (MOVOstore [off1+off2] {sym} ptr val mem) - for { - off1 := v.AuxInt - sym := v.Aux - v_0 := v.Args[0] - if v_0.Op != Op386ADDLconst { - break - } - off2 := v_0.AuxInt - ptr := v_0.Args[0] - val := v.Args[1] - mem := v.Args[2] - if !(is32Bit(off1 + off2)) { - break - } - v.reset(Op386MOVOstore) - v.AuxInt = off1 + off2 - v.Aux = sym - v.AddArg(ptr) - v.AddArg(val) - v.AddArg(mem) - return true - } - // match: (MOVOstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) - // cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2) - // result: (MOVOstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) - for { - off1 := v.AuxInt - sym1 := v.Aux - v_0 := v.Args[0] - if v_0.Op != Op386LEAL { - break - } - off2 := v_0.AuxInt - sym2 := v_0.Aux - base := v_0.Args[0] - val := v.Args[1] - mem := v.Args[2] - if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2)) { - break - } - v.reset(Op386MOVOstore) - v.AuxInt = off1 + off2 - v.Aux = mergeSym(sym1, sym2) - v.AddArg(base) - v.AddArg(val) - v.AddArg(mem) - return true - } - return false -} func rewriteValue386_Op386MOVSDload(v *Value, config *Config) bool { b := v.Block _ = b @@ -9073,26 +9100,6 @@ func rewriteValue386_OpMove(v *Value, config *Config) bool { return true } // match: (Move [s] dst src mem) - // cond: SizeAndAlign(s).Size() == 16 - // result: (MOVOstore dst (MOVOload src mem) mem) - for { - s := v.AuxInt - dst := v.Args[0] - src := v.Args[1] - mem := v.Args[2] - if !(SizeAndAlign(s).Size() == 16) { - break - } - v.reset(Op386MOVOstore) - v.AddArg(dst) - v0 := b.NewValue0(v.Line, Op386MOVOload, TypeInt128) - v0.AddArg(src) - v0.AddArg(mem) - v.AddArg(v0) - v.AddArg(mem) - return true - } - // match: (Move [s] dst src mem) // cond: SizeAndAlign(s).Size() == 3 // result: (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem)) for { @@ -9209,32 +9216,92 @@ func rewriteValue386_OpMove(v *Value, config *Config) bool { return true } // match: (Move [s] dst src mem) - // cond: SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice - // result: 
(DUFFCOPY [14*(64-SizeAndAlign(s).Size()/16)] dst src mem) + // cond: SizeAndAlign(s).Size() == 8 + // result: (MOVLstore [4] dst (MOVLload [4] src mem) (MOVLstore dst (MOVLload src mem) mem)) + for { + s := v.AuxInt + dst := v.Args[0] + src := v.Args[1] + mem := v.Args[2] + if !(SizeAndAlign(s).Size() == 8) { + break + } + v.reset(Op386MOVLstore) + v.AuxInt = 4 + v.AddArg(dst) + v0 := b.NewValue0(v.Line, Op386MOVLload, config.fe.TypeUInt32()) + v0.AuxInt = 4 + v0.AddArg(src) + v0.AddArg(mem) + v.AddArg(v0) + v1 := b.NewValue0(v.Line, Op386MOVLstore, TypeMem) + v1.AddArg(dst) + v2 := b.NewValue0(v.Line, Op386MOVLload, config.fe.TypeUInt32()) + v2.AddArg(src) + v2.AddArg(mem) + v1.AddArg(v2) + v1.AddArg(mem) + v.AddArg(v1) + return true + } + // match: (Move [s] dst src mem) + // cond: SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size()%4 != 0 + // result: (Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%4] (ADDLconst dst [SizeAndAlign(s).Size()%4]) (ADDLconst src [SizeAndAlign(s).Size()%4]) (MOVLstore dst (MOVLload src mem) mem)) for { s := v.AuxInt dst := v.Args[0] src := v.Args[1] mem := v.Args[2] - if !(SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice) { + if !(SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size()%4 != 0) { + break + } + v.reset(OpMove) + v.AuxInt = SizeAndAlign(s).Size() - SizeAndAlign(s).Size()%4 + v0 := b.NewValue0(v.Line, Op386ADDLconst, dst.Type) + v0.AddArg(dst) + v0.AuxInt = SizeAndAlign(s).Size() % 4 + v.AddArg(v0) + v1 := b.NewValue0(v.Line, Op386ADDLconst, src.Type) + v1.AddArg(src) + v1.AuxInt = SizeAndAlign(s).Size() % 4 + v.AddArg(v1) + v2 := b.NewValue0(v.Line, Op386MOVLstore, TypeMem) + v2.AddArg(dst) + v3 := b.NewValue0(v.Line, Op386MOVLload, config.fe.TypeUInt32()) + v3.AddArg(src) + v3.AddArg(mem) + v2.AddArg(v3) + v2.AddArg(mem) + v.AddArg(v2) + return true + } + // match: (Move [s] dst src mem) + // cond: SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice + // result: (DUFFCOPY [10*(128-SizeAndAlign(s).Size()/4)] dst src mem) + for { + s := v.AuxInt + dst := v.Args[0] + src := v.Args[1] + mem := v.Args[2] + if !(SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice) { break } v.reset(Op386DUFFCOPY) - v.AuxInt = 14 * (64 - SizeAndAlign(s).Size()/16) + v.AuxInt = 10 * (128 - SizeAndAlign(s).Size()/4) v.AddArg(dst) v.AddArg(src) v.AddArg(mem) return true } // match: (Move [s] dst src mem) - // cond: (SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0 + // cond: (SizeAndAlign(s).Size() > 4*128 || config.noDuffDevice) && SizeAndAlign(s).Size()%4 == 0 // result: (REPMOVSL dst src (MOVLconst [SizeAndAlign(s).Size()/4]) mem) for { s := v.AuxInt dst := v.Args[0] src := v.Args[1] mem := v.Args[2] - if !((SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0) { + if !((SizeAndAlign(s).Size() > 4*128 || config.noDuffDevice) && SizeAndAlign(s).Size()%4 == 0) { break } v.reset(Op386REPMOVSL) @@ -10006,32 +10073,16 @@ func rewriteValue386_OpOffPtr(v *Value, config *Config) bool { b := v.Block _ = b // match: (OffPtr [off] ptr) - // cond: is32Bit(off) + // cond: // result: (ADDLconst [off] ptr) for { off := v.AuxInt ptr := v.Args[0] - if !(is32Bit(off)) { - break - } v.reset(Op386ADDLconst) v.AuxInt = off v.AddArg(ptr) return true } - // match: (OffPtr [off] ptr) - // cond: - // result: 
(ADDL (MOVLconst [off]) ptr) - for { - off := v.AuxInt - ptr := v.Args[0] - v.reset(Op386ADDL) - v0 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) - v0.AuxInt = off - v.AddArg(v0) - v.AddArg(ptr) - return true - } } func rewriteValue386_OpOr16(v *Value, config *Config) bool { b := v.Block @@ -10243,6 +10294,45 @@ func rewriteValue386_OpRsh16Ux32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh16Ux64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh16Ux64 x (Const64 [c])) + // cond: uint64(c) < 16 + // result: (SHRWconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 16) { + break + } + v.reset(Op386SHRWconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Rsh16Ux64 _ (Const64 [c])) + // cond: uint64(c) >= 16 + // result: (Const16 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 16) { + break + } + v.reset(OpConst16) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpRsh16Ux8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10321,6 +10411,29 @@ func rewriteValue386_OpRsh16x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh16x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh16x64 x (Const64 [c])) + // cond: uint64(c) < 16 + // result: (SARWconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 16) { + break + } + v.reset(Op386SARWconst) + v.AddArg(x) + v.AuxInt = c + return true + } + return false +} func rewriteValue386_OpRsh16x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10396,6 +10509,45 @@ func rewriteValue386_OpRsh32Ux32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh32Ux64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh32Ux64 x (Const64 [c])) + // cond: uint64(c) < 32 + // result: (SHRLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 32) { + break + } + v.reset(Op386SHRLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Rsh32Ux64 _ (Const64 [c])) + // cond: uint64(c) >= 32 + // result: (Const32 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 32) { + break + } + v.reset(OpConst32) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpRsh32Ux8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10474,6 +10626,29 @@ func rewriteValue386_OpRsh32x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh32x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh32x64 x (Const64 [c])) + // cond: uint64(c) < 32 + // result: (SARLconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 32) { + break + } + v.reset(Op386SARLconst) + v.AddArg(x) + v.AuxInt = c + return true + } + return false +} func rewriteValue386_OpRsh32x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10549,6 +10724,45 @@ func rewriteValue386_OpRsh8Ux32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh8Ux64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh8Ux64 x (Const64 [c])) + // cond: uint64(c) < 8 + // result: (SHRBconst x 
[c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 8) { + break + } + v.reset(Op386SHRBconst) + v.AddArg(x) + v.AuxInt = c + return true + } + // match: (Rsh8Ux64 _ (Const64 [c])) + // cond: uint64(c) >= 8 + // result: (Const8 [0]) + for { + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) >= 8) { + break + } + v.reset(OpConst8) + v.AuxInt = 0 + return true + } + return false +} func rewriteValue386_OpRsh8Ux8(v *Value, config *Config) bool { b := v.Block _ = b @@ -10627,6 +10841,29 @@ func rewriteValue386_OpRsh8x32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRsh8x64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Rsh8x64 x (Const64 [c])) + // cond: uint64(c) < 8 + // result: (SARBconst x [c]) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpConst64 { + break + } + c := v_1.AuxInt + if !(uint64(c) < 8) { + break + } + v.reset(Op386SARBconst) + v.AddArg(x) + v.AuxInt = c + return true + } + return false +} func rewriteValue386_OpRsh8x8(v *Value, config *Config) bool { b := v.Block _ = b @@ -12014,6 +12251,20 @@ func rewriteValue386_OpSignExt8to32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpSignmask(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Signmask x) + // cond: + // result: (SARLconst x [31]) + for { + x := v.Args[0] + v.reset(Op386SARLconst) + v.AddArg(x) + v.AuxInt = 31 + return true + } +} func rewriteValue386_OpSqrt(v *Value, config *Config) bool { b := v.Block _ = b @@ -12681,19 +12932,38 @@ func rewriteValue386_OpZero(v *Value, config *Config) bool { return true } // match: (Zero [s] destptr mem) - // cond: (SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32)) && SizeAndAlign(s).Size()%8 == 0 - // result: (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/8]) (MOVLconst [0]) mem) + // cond: SizeAndAlign(s).Size() > 16 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice + // result: (DUFFZERO [1*(128-SizeAndAlign(s).Size()/4)] destptr (MOVLconst [0]) mem) + for { + s := v.AuxInt + destptr := v.Args[0] + mem := v.Args[1] + if !(SizeAndAlign(s).Size() > 16 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0 && !config.noDuffDevice) { + break + } + v.reset(Op386DUFFZERO) + v.AuxInt = 1 * (128 - SizeAndAlign(s).Size()/4) + v.AddArg(destptr) + v0 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) + v0.AuxInt = 0 + v.AddArg(v0) + v.AddArg(mem) + return true + } + // match: (Zero [s] destptr mem) + // cond: (SizeAndAlign(s).Size() > 4*128 || (config.noDuffDevice && SizeAndAlign(s).Size() > 16)) && SizeAndAlign(s).Size()%4 == 0 + // result: (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/4]) (MOVLconst [0]) mem) for { s := v.AuxInt destptr := v.Args[0] mem := v.Args[1] - if !((SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32)) && SizeAndAlign(s).Size()%8 == 0) { + if !((SizeAndAlign(s).Size() > 4*128 || (config.noDuffDevice && SizeAndAlign(s).Size() > 16)) && SizeAndAlign(s).Size()%4 == 0) { break } v.reset(Op386REPSTOSL) v.AddArg(destptr) v0 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) - v0.AuxInt = SizeAndAlign(s).Size() / 8 + v0.AuxInt = SizeAndAlign(s).Size() / 4 v.AddArg(v0) v1 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) v1.AuxInt = 0 @@ -12742,6 +13012,24 @@ func 
rewriteValue386_OpZeroExt8to32(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpZeromask(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Zeromask x) + // cond: + // result: (SBBLcarrymask (CMPL (MOVLconst [0]) x)) + for { + x := v.Args[0] + v.reset(Op386SBBLcarrymask) + v0 := b.NewValue0(v.Line, Op386CMPL, TypeFlags) + v1 := b.NewValue0(v.Line, Op386MOVLconst, config.fe.TypeUInt32()) + v1.AuxInt = 0 + v0.AddArg(v1) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} func rewriteBlock386(b *Block) bool { switch b.Kind { case Block386EQ: diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 9888d065cd..01c268f70b 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -17415,7 +17415,7 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool { } // match: (Zero [s] destptr mem) // cond: SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice - // result: (DUFFZERO [duffStart(SizeAndAlign(s).Size())] (ADDQconst [duffAdj(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem) + // result: (DUFFZERO [duffStartAMD64(SizeAndAlign(s).Size())] (ADDQconst [duffAdjAMD64(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem) for { s := v.AuxInt destptr := v.Args[0] @@ -17424,9 +17424,9 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool { break } v.reset(OpAMD64DUFFZERO) - v.AuxInt = duffStart(SizeAndAlign(s).Size()) + v.AuxInt = duffStartAMD64(SizeAndAlign(s).Size()) v0 := b.NewValue0(v.Line, OpAMD64ADDQconst, config.fe.TypeUInt64()) - v0.AuxInt = duffAdj(SizeAndAlign(s).Size()) + v0.AuxInt = duffAdjAMD64(SizeAndAlign(s).Size()) v0.AddArg(destptr) v.AddArg(v0) v1 := b.NewValue0(v.Line, OpAMD64MOVOconst, TypeInt128) diff --git a/src/cmd/compile/internal/x86/ssa.go b/src/cmd/compile/internal/x86/ssa.go index ab6410b1c3..f83afa1a58 100644 --- a/src/cmd/compile/internal/x86/ssa.go +++ b/src/cmd/compile/internal/x86/ssa.go @@ -101,11 +101,14 @@ func storeByType(t ssa.Type) obj.As { // moveByType returns the reg->reg move instruction of the given type. func moveByType(t ssa.Type) obj.As { if t.IsFloat() { - // Moving the whole sse2 register is faster - // than moving just the correct low portion of it. - // There is no xmm->xmm move with 1 byte opcode, - // so use movups, which has 2 byte opcode. 
- return x86.AMOVUPS + switch t.Size() { + case 4: + return x86.AMOVSS + case 8: + return x86.AMOVSD + default: + panic(fmt.Sprintf("bad float register width %d:%s", t.Size(), t)) + } } else { switch t.Size() { case 1: @@ -115,8 +118,6 @@ func moveByType(t ssa.Type) obj.As { return x86.AMOVL case 4: return x86.AMOVL - case 16: - return x86.AMOVUPS // int128s are in SSE registers default: panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t)) } @@ -448,7 +449,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.From.Val = math.Float64frombits(uint64(v.AuxInt)) p.To.Type = obj.TYPE_REG p.To.Reg = x - case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload, ssa.Op386MOVOload: + case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = gc.SSARegNum(v.Args[0]) @@ -496,7 +497,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) - case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore, ssa.Op386MOVOstore: + case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[1]) @@ -584,12 +585,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg)) p.To.Offset = v.AuxInt - case ssa.Op386MOVOconst: - if v.AuxInt != 0 { - v.Unimplementedf("MOVOconst can only do constant=0") - } - r := gc.SSARegNum(v) - opregreg(x86.AXORPS, r, r) case ssa.Op386DUFFCOPY: p := gc.Prog(obj.ADUFFCOPY) p.To.Type = obj.TYPE_ADDR @@ -828,8 +823,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { case ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload, - ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVOload, - ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVOstore: + ssa.Op386MOVSSload, ssa.Op386MOVSDload, + ssa.Op386MOVSSstore, ssa.Op386MOVSDstore: if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage { if gc.Debug_checknil != 0 && int(v.Line) > 1 { gc.Warnl(v.Line, "removed nil check") -- 2.48.1
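Editor's note (not part of the patch): the DUFFCOPY and DUFFZERO aux offsets chosen by the new 386 rules follow directly from the magic constants documented above: duffcopy is 128 copy blocks of 10 bytes each, and duffzero is 128 one-byte STOSL instructions. A minimal sketch of the entry-offset computation, using hypothetical helper names, assuming a size the rules accept (a multiple of 4 within the duff range):

func duffCopyEntry386(size int64) int64 {
	// Each duffcopy block is 10 bytes of code:
	//   MOVL (SI), CX; ADDL $4, SI; MOVL CX, (DI); ADDL $4, DI
	// so jumping to offset 10*(128 - size/4) skips the blocks that are not needed.
	return 10 * (128 - size/4)
}

func duffZeroEntry386(size int64) int64 {
	// duffzero is a run of 128 STOSL instructions, each 1 byte long.
	return 1 * (128 - size/4)
}

For example, a 64-byte Move selects DUFFCOPY [10*(128-16)] = [1120], entering duffcopy just in time to execute the final 16 copy blocks.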