From 4e182db5fc876564a4f87a0602c58ea0ddc6e37c Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 12 Aug 2025 15:14:13 -0700 Subject: [PATCH] Revert "cmd/compile: use generated loops instead of DUFFCOPY on amd64" This reverts commit ec9e1176c3209cf92e73e3deb2d8073fab5ea4d6 (CL 678620). Reason for revert: causing regalloc to get into an infinite loop Change-Id: Ie53c58c6126804af6d6883ea4acdcfb632a172bd Reviewed-on: https://go-review.googlesource.com/c/go/+/695196 LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Auto-Submit: Keith Randall Reviewed-by: Keith Randall Reviewed-by: Dmitri Shuralyov --- src/cmd/compile/internal/amd64/ssa.go | 173 ++++++------------ src/cmd/compile/internal/ssa/_gen/AMD64.rules | 49 +++-- src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 34 +--- src/cmd/compile/internal/ssa/opGen.go | 37 +--- src/cmd/compile/internal/ssa/regalloc.go | 9 +- src/cmd/compile/internal/ssa/regalloc_test.go | 24 --- src/cmd/compile/internal/ssa/rewrite.go | 1 - src/cmd/compile/internal/ssa/rewriteAMD64.go | 139 +++++++++++--- 8 files changed, 217 insertions(+), 249 deletions(-) diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 8c8c7d9027..625e725fe3 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -142,6 +142,45 @@ func memIdx(a *obj.Addr, v *ssa.Value) { a.Index = i } +// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ, +// See runtime/mkduff.go. +const ( + dzBlocks = 16 // number of MOV/ADD blocks + dzBlockLen = 4 // number of clears per block + dzBlockSize = 23 // size of instructions in a single block + dzMovSize = 5 // size of single MOV instruction w/ offset + dzLeaqSize = 4 // size of single LEAQ instruction + dzClearStep = 16 // number of bytes cleared by each MOV instruction +) + +func duffStart(size int64) int64 { + x, _ := duff(size) + return x +} +func duffAdj(size int64) int64 { + _, x := duff(size) + return x +} + +// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes) +// required to use the duffzero mechanism for a block of the given size. +func duff(size int64) (int64, int64) { + if size < 32 || size > 1024 || size%dzClearStep != 0 { + panic("bad duffzero size") + } + steps := size / dzClearStep + blocks := steps / dzBlockLen + steps %= dzBlockLen + off := dzBlockSize * (dzBlocks - blocks) + var adj int64 + if steps != 0 { + off -= dzLeaqSize + off -= dzMovSize * steps + adj -= dzClearStep * (dzBlockLen - steps) + } + return off, adj +} + func getgFromTLS(s *ssagen.State, r int16) { // See the comments in cmd/internal/obj/x86/obj6.go // near CanUse1InsnTLS for a detailed explanation of these instructions. @@ -1065,110 +1104,20 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { zero16(off + n - 16) } - case ssa.OpAMD64LoweredMove: - dstReg := v.Args[0].Reg() - srcReg := v.Args[1].Reg() - if dstReg == srcReg { - break - } - tmpReg := int16(x86.REG_X14) - n := v.AuxInt - if n < 16 { - v.Fatalf("Move too small %d", n) - } - // move 16 bytes from srcReg+off to dstReg+off. - move16 := func(off int64) { - move16(s, srcReg, dstReg, tmpReg, off) - } - - // Generate copying instructions. - var off int64 - for n >= 16 { - move16(off) - off += 16 - n -= 16 - } - if n != 0 { - // use partially overlapped read/write. - // TODO: use smaller operations when we can? 
- move16(off + n - 16) - } - - case ssa.OpAMD64LoweredMoveLoop: - dstReg := v.Args[0].Reg() - srcReg := v.Args[1].Reg() - if dstReg == srcReg { - break - } - countReg := v.RegTmp() - tmpReg := int16(x86.REG_X14) - n := v.AuxInt - loopSize := int64(64) - if n < 3*loopSize { - // - a loop count of 0 won't work. - // - a loop count of 1 is useless. - // - a loop count of 2 is a code size ~tie - // 4 instructions to implement the loop - // 4 instructions in the loop body - // vs - // 8 instructions in the straightline code - // Might as well use straightline code. - v.Fatalf("ZeroLoop size too small %d", n) - } - // move 16 bytes from srcReg+off to dstReg+off. - move16 := func(off int64) { - move16(s, srcReg, dstReg, tmpReg, off) - } - - // Put iteration count in a register. - // MOVL $n, countReg - p := s.Prog(x86.AMOVL) - p.From.Type = obj.TYPE_CONST - p.From.Offset = n / loopSize - p.To.Type = obj.TYPE_REG - p.To.Reg = countReg - cntInit := p - - // Copy loopSize bytes starting at srcReg to dstReg. - for i := range loopSize / 16 { - move16(i * 16) - } - // ADDQ $loopSize, srcReg - p = s.Prog(x86.AADDQ) - p.From.Type = obj.TYPE_CONST - p.From.Offset = loopSize - p.To.Type = obj.TYPE_REG - p.To.Reg = srcReg - // ADDQ $loopSize, dstReg - p = s.Prog(x86.AADDQ) - p.From.Type = obj.TYPE_CONST - p.From.Offset = loopSize - p.To.Type = obj.TYPE_REG - p.To.Reg = dstReg - // DECL countReg - p = s.Prog(x86.ADECL) - p.To.Type = obj.TYPE_REG - p.To.Reg = countReg - // Jump to loop header if we're not done yet. - // JNE head - p = s.Prog(x86.AJNE) - p.To.Type = obj.TYPE_BRANCH - p.To.SetTarget(cntInit.Link) - - // Multiples of the loop size are now done. - n %= loopSize - - // Copy any fractional portion. - var off int64 - for n >= 16 { - move16(off) - off += 16 - n -= 16 - } - if n != 0 { - // Use partially-overlapping copy. - move16(off + n - 16) + case ssa.OpAMD64DUFFCOPY: + p := s.Prog(obj.ADUFFCOPY) + p.To.Type = obj.TYPE_ADDR + p.To.Sym = ir.Syms.Duffcopy + if v.AuxInt%16 != 0 { + v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt) } + p.To.Offset = 14 * (64 - v.AuxInt/16) + // 14 and 64 are magic constants. 14 is the number of bytes to encode: + // MOVUPS (SI), X0 + // ADDQ $16, SI + // MOVUPS X0, (DI) + // ADDQ $16, DI + // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy. case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy? if v.Type.IsMemory() { @@ -1760,21 +1709,3 @@ func zero16(s *ssagen.State, reg int16, off int64) { p.To.Reg = reg p.To.Offset = off } - -// move 16 bytes from src+off to dst+off using temporary register tmp. 
-func move16(s *ssagen.State, src, dst, tmp int16, off int64) { - // MOVUPS off(srcReg), tmpReg - // MOVUPS tmpReg, off(dstReg) - p := s.Prog(x86.AMOVUPS) - p.From.Type = obj.TYPE_MEM - p.From.Reg = src - p.From.Offset = off - p.To.Type = obj.TYPE_REG - p.To.Reg = tmp - p = s.Prog(x86.AMOVUPS) - p.From.Type = obj.TYPE_REG - p.From.Reg = tmp - p.To.Type = obj.TYPE_MEM - p.To.Reg = dst - p.To.Offset = off -} diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index 7d3efef5cd..1e0a599570 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -264,6 +264,24 @@ (Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem) (Move [16] dst src mem) => (MOVOstore dst (MOVOload src mem) mem) +(Move [32] dst src mem) => + (Move [16] + (OffPtr dst [16]) + (OffPtr src [16]) + (Move [16] dst src mem)) + +(Move [48] dst src mem) => + (Move [32] + (OffPtr dst [16]) + (OffPtr src [16]) + (Move [16] dst src mem)) + +(Move [64] dst src mem) => + (Move [32] + (OffPtr dst [32]) + (OffPtr src [32]) + (Move [32] dst src mem)) + (Move [3] dst src mem) => (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem)) @@ -292,19 +310,28 @@ (MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem) (MOVQstore dst (MOVQload src mem) mem)) -// Copying up to 192 bytes uses straightline code. -(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem) - -// Copying up to ~1KB uses a small loop. -(Move [s] dst src mem) && s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem) +// Adjust moves to be a multiple of 16 bytes. +(Move [s] dst src mem) + && s > 16 && s%16 != 0 && s%16 <= 8 => + (Move [s-s%16] + (OffPtr dst [s%16]) + (OffPtr src [s%16]) + (MOVQstore dst (MOVQload src mem) mem)) +(Move [s] dst src mem) + && s > 16 && s%16 != 0 && s%16 > 8 => + (Move [s-s%16] + (OffPtr dst [s%16]) + (OffPtr src [s%16]) + (MOVOstore dst (MOVOload src mem) mem)) + +// Medium copying uses a duff device. +(Move [s] dst src mem) + && s > 64 && s <= 16*64 && s%16 == 0 + && logLargeCopy(v, s) => + (DUFFCOPY [s] dst src mem) // Large copying uses REP MOVSQ. 
-(Move [s] dst src mem) && s > repMoveThreshold && s%8 != 0 => - (Move [s-s%8] - (OffPtr dst [s%8]) - (OffPtr src [s%8]) - (MOVQstore dst (MOVQload src mem) mem)) -(Move [s] dst src mem) && s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) => +(Move [s] dst src mem) && s > 16*64 && s%8 == 0 && logLargeCopy(v, s) => (REPMOVSQ dst src (MOVQconst [s/8]) mem) // Lowering Zero instructions diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index e42b54398d..b6c019f28a 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -939,38 +939,20 @@ func init() { // arg0 = destination pointer // arg1 = source pointer // arg2 = mem - // auxint = # of bytes to copy + // auxint = # of bytes to copy, must be multiple of 16 // returns memory { - name: "LoweredMove", + name: "DUFFCOPY", aux: "Int64", argLength: 3, reg: regInfo{ - inputs: []regMask{gp, gp}, - clobbers: buildReg("X14"), // uses X14 as a temporary + inputs: []regMask{buildReg("DI"), buildReg("SI")}, + clobbers: buildReg("DI SI X0"), // uses X0 as a temporary }, - faultOnNilArg0: true, - faultOnNilArg1: true, - }, - // arg0 = destination pointer - // arg1 = source pointer - // arg2 = mem - // auxint = # of bytes to copy - // returns memory - { - name: "LoweredMoveLoop", - aux: "Int64", - argLength: 3, - reg: regInfo{ - inputs: []regMask{gp, gp}, - clobbers: buildReg("X14"), // uses X14 as a temporary - clobbersArg0: true, - clobbersArg1: true, - }, - clobberFlags: true, - faultOnNilArg0: true, - faultOnNilArg1: true, - needIntTemp: true, + clobberFlags: true, + //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point + //faultOnNilArg1: true, + unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts }, // arg0 = destination pointer diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 95f8d48a61..06a05c6e3f 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1058,8 +1058,7 @@ const ( OpAMD64CALLtail OpAMD64CALLclosure OpAMD64CALLinter - OpAMD64LoweredMove - OpAMD64LoweredMoveLoop + OpAMD64DUFFCOPY OpAMD64REPMOVSQ OpAMD64InvertFlags OpAMD64LoweredGetG @@ -13966,35 +13965,17 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "LoweredMove", - auxType: auxInt64, - argLen: 3, - faultOnNilArg0: true, - faultOnNilArg1: true, - reg: regInfo{ - inputs: []inputInfo{ - {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 - {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 - }, - clobbers: 1073741824, // X14 - }, - }, - { - name: "LoweredMoveLoop", - auxType: auxInt64, - argLen: 3, - clobberFlags: true, - needIntTemp: true, - faultOnNilArg0: true, - faultOnNilArg1: true, + name: "DUFFCOPY", + auxType: auxInt64, + argLen: 3, + clobberFlags: true, + unsafePoint: true, reg: regInfo{ inputs: []inputInfo{ - {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 - {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + {0, 128}, // DI + {1, 64}, // SI }, - clobbers: 1073741824, // X14 - clobbersArg0: true, - clobbersArg1: true, + clobbers: 65728, // SI DI X0 }, }, { diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go index 42dae0088d..c0881c7a45 100644 --- a/src/cmd/compile/internal/ssa/regalloc.go +++ b/src/cmd/compile/internal/ssa/regalloc.go @@ -561,14 +561,7 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, 
nospill bool, pos pos = pos.WithNotStmt() // Check if v is already in a requested register. if mask&vi.regs != 0 { - mask &= vi.regs - r := pickReg(mask) - if mask.contains(s.SPReg) { - // Prefer the stack pointer if it is allowed. - // (Needed because the op might have an Aux symbol - // that needs SP as its base.) - r = s.SPReg - } + r := pickReg(mask & vi.regs) if !s.allocatable.contains(r) { return v // v is in a fixed register } diff --git a/src/cmd/compile/internal/ssa/regalloc_test.go b/src/cmd/compile/internal/ssa/regalloc_test.go index 0f69b852d1..e7ed416c50 100644 --- a/src/cmd/compile/internal/ssa/regalloc_test.go +++ b/src/cmd/compile/internal/ssa/regalloc_test.go @@ -240,30 +240,6 @@ func TestClobbersArg0(t *testing.T) { } } -func TestClobbersArg1(t *testing.T) { - c := testConfig(t) - f := c.Fun("entry", - Bloc("entry", - Valu("mem", OpInitMem, types.TypeMem, 0, nil), - Valu("src", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())), - Valu("dst", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())), - Valu("use1", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())), - Valu("use2", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())), - Valu("move", OpAMD64LoweredMoveLoop, types.TypeMem, 256, nil, "dst", "src", "mem"), - Valu("store1", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use1", "src", "move"), - Valu("store2", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use2", "dst", "store1"), - Exit("store2"))) - flagalloc(f.f) - regalloc(f.f) - checkFunc(f.f) - // LoweredMoveLoop clobbers its arguments, so there must be a copy of "src" and "dst" somewhere - // so we still have that value available at the stores. - if n := numCopies(f.blocks["entry"]); n != 2 { - fmt.Printf("%s\n", f.f.String()) - t.Errorf("got %d copies, want 2", n) - } -} - func numSpills(b *Block) int { return numOps(b, OpStoreReg) } diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 236a3f885a..2239927521 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -31,7 +31,6 @@ const ( removeDeadValues = true repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing - repMoveThreshold = 1408 // size beyond which we use REP MOVS for copying ) // deadcode indicates whether rewrite should try to remove any values that become dead. 
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index a7ee632ae1..c83890aee6 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -27307,6 +27307,75 @@ func rewriteValueAMD64_OpMove(v *Value) bool { v.AddArg3(dst, v0, mem) return true } + // match: (Move [32] dst src mem) + // result: (Move [16] (OffPtr dst [16]) (OffPtr src [16]) (Move [16] dst src mem)) + for { + if auxIntToInt64(v.AuxInt) != 32 { + break + } + dst := v_0 + src := v_1 + mem := v_2 + v.reset(OpMove) + v.AuxInt = int64ToAuxInt(16) + v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) + v0.AuxInt = int64ToAuxInt(16) + v0.AddArg(dst) + v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) + v1.AuxInt = int64ToAuxInt(16) + v1.AddArg(src) + v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem) + v2.AuxInt = int64ToAuxInt(16) + v2.AddArg3(dst, src, mem) + v.AddArg3(v0, v1, v2) + return true + } + // match: (Move [48] dst src mem) + // result: (Move [32] (OffPtr dst [16]) (OffPtr src [16]) (Move [16] dst src mem)) + for { + if auxIntToInt64(v.AuxInt) != 48 { + break + } + dst := v_0 + src := v_1 + mem := v_2 + v.reset(OpMove) + v.AuxInt = int64ToAuxInt(32) + v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) + v0.AuxInt = int64ToAuxInt(16) + v0.AddArg(dst) + v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) + v1.AuxInt = int64ToAuxInt(16) + v1.AddArg(src) + v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem) + v2.AuxInt = int64ToAuxInt(16) + v2.AddArg3(dst, src, mem) + v.AddArg3(v0, v1, v2) + return true + } + // match: (Move [64] dst src mem) + // result: (Move [32] (OffPtr dst [32]) (OffPtr src [32]) (Move [32] dst src mem)) + for { + if auxIntToInt64(v.AuxInt) != 64 { + break + } + dst := v_0 + src := v_1 + mem := v_2 + v.reset(OpMove) + v.AuxInt = int64ToAuxInt(32) + v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) + v0.AuxInt = int64ToAuxInt(32) + v0.AddArg(dst) + v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) + v1.AuxInt = int64ToAuxInt(32) + v1.AddArg(src) + v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem) + v2.AuxInt = int64ToAuxInt(32) + v2.AddArg3(dst, src, mem) + v.AddArg3(v0, v1, v2) + return true + } // match: (Move [3] dst src mem) // result: (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem)) for { @@ -27499,72 +27568,82 @@ func rewriteValueAMD64_OpMove(v *Value) bool { return true } // match: (Move [s] dst src mem) - // cond: s > 16 && s < 192 && logLargeCopy(v, s) - // result: (LoweredMove [s] dst src mem) + // cond: s > 16 && s%16 != 0 && s%16 <= 8 + // result: (Move [s-s%16] (OffPtr dst [s%16]) (OffPtr src [s%16]) (MOVQstore dst (MOVQload src mem) mem)) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s > 16 && s < 192 && logLargeCopy(v, s)) { + if !(s > 16 && s%16 != 0 && s%16 <= 8) { break } - v.reset(OpAMD64LoweredMove) - v.AuxInt = int64ToAuxInt(s) - v.AddArg3(dst, src, mem) + v.reset(OpMove) + v.AuxInt = int64ToAuxInt(s - s%16) + v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) + v0.AuxInt = int64ToAuxInt(s % 16) + v0.AddArg(dst) + v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) + v1.AuxInt = int64ToAuxInt(s % 16) + v1.AddArg(src) + v2 := b.NewValue0(v.Pos, OpAMD64MOVQstore, types.TypeMem) + v3 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64) + v3.AddArg2(src, mem) + v2.AddArg3(dst, v3, mem) + v.AddArg3(v0, v1, v2) return true } // match: (Move [s] dst src mem) - // cond: s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) - // result: (LoweredMoveLoop [s] dst src mem) + // cond: 
s > 16 && s%16 != 0 && s%16 > 8 + // result: (Move [s-s%16] (OffPtr dst [s%16]) (OffPtr src [s%16]) (MOVOstore dst (MOVOload src mem) mem)) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s)) { + if !(s > 16 && s%16 != 0 && s%16 > 8) { break } - v.reset(OpAMD64LoweredMoveLoop) - v.AuxInt = int64ToAuxInt(s) - v.AddArg3(dst, src, mem) + v.reset(OpMove) + v.AuxInt = int64ToAuxInt(s - s%16) + v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) + v0.AuxInt = int64ToAuxInt(s % 16) + v0.AddArg(dst) + v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) + v1.AuxInt = int64ToAuxInt(s % 16) + v1.AddArg(src) + v2 := b.NewValue0(v.Pos, OpAMD64MOVOstore, types.TypeMem) + v3 := b.NewValue0(v.Pos, OpAMD64MOVOload, types.TypeInt128) + v3.AddArg2(src, mem) + v2.AddArg3(dst, v3, mem) + v.AddArg3(v0, v1, v2) return true } // match: (Move [s] dst src mem) - // cond: s > repMoveThreshold && s%8 != 0 - // result: (Move [s-s%8] (OffPtr dst [s%8]) (OffPtr src [s%8]) (MOVQstore dst (MOVQload src mem) mem)) + // cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s) + // result: (DUFFCOPY [s] dst src mem) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s > repMoveThreshold && s%8 != 0) { + if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) { break } - v.reset(OpMove) - v.AuxInt = int64ToAuxInt(s - s%8) - v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) - v0.AuxInt = int64ToAuxInt(s % 8) - v0.AddArg(dst) - v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) - v1.AuxInt = int64ToAuxInt(s % 8) - v1.AddArg(src) - v2 := b.NewValue0(v.Pos, OpAMD64MOVQstore, types.TypeMem) - v3 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64) - v3.AddArg2(src, mem) - v2.AddArg3(dst, v3, mem) - v.AddArg3(v0, v1, v2) + v.reset(OpAMD64DUFFCOPY) + v.AuxInt = int64ToAuxInt(s) + v.AddArg3(dst, src, mem) return true } // match: (Move [s] dst src mem) - // cond: s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) + // cond: s > 16*64 && s%8 == 0 && logLargeCopy(v, s) // result: (REPMOVSQ dst src (MOVQconst [s/8]) mem) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s)) { + if !(s > 16*64 && s%8 == 0 && logLargeCopy(v, s)) { break } v.reset(OpAMD64REPMOVSQ) -- 2.51.0