From ca66f907dd44d57c93b2dc5ecafcb8addf2b23c3 Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Wed, 13 Aug 2025 09:41:17 -0700 Subject: [PATCH] cmd/compile: use generated loops instead of DUFFCOPY on amd64 This reverts commit 4e182db5fc876564a4f87a0602c58ea0ddc6e37c (CL 695196), which is itself a revert of ec9e1176c3209cf92e73e3deb2d8073fab5ea4d6 (CL 678620). So this CL is exactly the same as CL 678620, but with a regalloc fix (CL 696035) submitted first. Change-Id: I743ab32fa3aa6ef3e1b2b6751a2ef4519139057c Reviewed-on: https://go-review.googlesource.com/c/go/+/696016 Reviewed-by: David Chase Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/ssa.go | 173 ++++++++++++------ src/cmd/compile/internal/ssa/_gen/AMD64.rules | 49 ++--- src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 34 +++- src/cmd/compile/internal/ssa/opGen.go | 37 +++- src/cmd/compile/internal/ssa/regalloc.go | 9 +- src/cmd/compile/internal/ssa/regalloc_test.go | 24 +++ src/cmd/compile/internal/ssa/rewrite.go | 1 + src/cmd/compile/internal/ssa/rewriteAMD64.go | 139 +++----------- 8 files changed, 249 insertions(+), 217 deletions(-) diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 625e725fe3..8c8c7d9027 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -142,45 +142,6 @@ func memIdx(a *obj.Addr, v *ssa.Value) { a.Index = i } -// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ, -// See runtime/mkduff.go. -const ( - dzBlocks = 16 // number of MOV/ADD blocks - dzBlockLen = 4 // number of clears per block - dzBlockSize = 23 // size of instructions in a single block - dzMovSize = 5 // size of single MOV instruction w/ offset - dzLeaqSize = 4 // size of single LEAQ instruction - dzClearStep = 16 // number of bytes cleared by each MOV instruction -) - -func duffStart(size int64) int64 { - x, _ := duff(size) - return x -} -func duffAdj(size int64) int64 { - _, x := duff(size) - return x -} - -// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes) -// required to use the duffzero mechanism for a block of the given size. -func duff(size int64) (int64, int64) { - if size < 32 || size > 1024 || size%dzClearStep != 0 { - panic("bad duffzero size") - } - steps := size / dzClearStep - blocks := steps / dzBlockLen - steps %= dzBlockLen - off := dzBlockSize * (dzBlocks - blocks) - var adj int64 - if steps != 0 { - off -= dzLeaqSize - off -= dzMovSize * steps - adj -= dzClearStep * (dzBlockLen - steps) - } - return off, adj -} - func getgFromTLS(s *ssagen.State, r int16) { // See the comments in cmd/internal/obj/x86/obj6.go // near CanUse1InsnTLS for a detailed explanation of these instructions. @@ -1104,20 +1065,110 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { zero16(off + n - 16) } - case ssa.OpAMD64DUFFCOPY: - p := s.Prog(obj.ADUFFCOPY) - p.To.Type = obj.TYPE_ADDR - p.To.Sym = ir.Syms.Duffcopy - if v.AuxInt%16 != 0 { - v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt) + case ssa.OpAMD64LoweredMove: + dstReg := v.Args[0].Reg() + srcReg := v.Args[1].Reg() + if dstReg == srcReg { + break + } + tmpReg := int16(x86.REG_X14) + n := v.AuxInt + if n < 16 { + v.Fatalf("Move too small %d", n) + } + // move 16 bytes from srcReg+off to dstReg+off. + move16 := func(off int64) { + move16(s, srcReg, dstReg, tmpReg, off) + } + + // Generate copying instructions. 
+ var off int64 + for n >= 16 { + move16(off) + off += 16 + n -= 16 + } + if n != 0 { + // use partially overlapped read/write. + // TODO: use smaller operations when we can? + move16(off + n - 16) + } + + case ssa.OpAMD64LoweredMoveLoop: + dstReg := v.Args[0].Reg() + srcReg := v.Args[1].Reg() + if dstReg == srcReg { + break + } + countReg := v.RegTmp() + tmpReg := int16(x86.REG_X14) + n := v.AuxInt + loopSize := int64(64) + if n < 3*loopSize { + // - a loop count of 0 won't work. + // - a loop count of 1 is useless. + // - a loop count of 2 is a code size ~tie + // 4 instructions to implement the loop + // 4 instructions in the loop body + // vs + // 8 instructions in the straightline code + // Might as well use straightline code. + v.Fatalf("ZeroLoop size too small %d", n) + } + // move 16 bytes from srcReg+off to dstReg+off. + move16 := func(off int64) { + move16(s, srcReg, dstReg, tmpReg, off) + } + + // Put iteration count in a register. + // MOVL $n, countReg + p := s.Prog(x86.AMOVL) + p.From.Type = obj.TYPE_CONST + p.From.Offset = n / loopSize + p.To.Type = obj.TYPE_REG + p.To.Reg = countReg + cntInit := p + + // Copy loopSize bytes starting at srcReg to dstReg. + for i := range loopSize / 16 { + move16(i * 16) + } + // ADDQ $loopSize, srcReg + p = s.Prog(x86.AADDQ) + p.From.Type = obj.TYPE_CONST + p.From.Offset = loopSize + p.To.Type = obj.TYPE_REG + p.To.Reg = srcReg + // ADDQ $loopSize, dstReg + p = s.Prog(x86.AADDQ) + p.From.Type = obj.TYPE_CONST + p.From.Offset = loopSize + p.To.Type = obj.TYPE_REG + p.To.Reg = dstReg + // DECL countReg + p = s.Prog(x86.ADECL) + p.To.Type = obj.TYPE_REG + p.To.Reg = countReg + // Jump to loop header if we're not done yet. + // JNE head + p = s.Prog(x86.AJNE) + p.To.Type = obj.TYPE_BRANCH + p.To.SetTarget(cntInit.Link) + + // Multiples of the loop size are now done. + n %= loopSize + + // Copy any fractional portion. + var off int64 + for n >= 16 { + move16(off) + off += 16 + n -= 16 + } + if n != 0 { + // Use partially-overlapping copy. + move16(off + n - 16) } - p.To.Offset = 14 * (64 - v.AuxInt/16) - // 14 and 64 are magic constants. 14 is the number of bytes to encode: - // MOVUPS (SI), X0 - // ADDQ $16, SI - // MOVUPS X0, (DI) - // ADDQ $16, DI - // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy. case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy? if v.Type.IsMemory() { @@ -1709,3 +1760,21 @@ func zero16(s *ssagen.State, reg int16, off int64) { p.To.Reg = reg p.To.Offset = off } + +// move 16 bytes from src+off to dst+off using temporary register tmp. 
+func move16(s *ssagen.State, src, dst, tmp int16, off int64) { + // MOVUPS off(srcReg), tmpReg + // MOVUPS tmpReg, off(dstReg) + p := s.Prog(x86.AMOVUPS) + p.From.Type = obj.TYPE_MEM + p.From.Reg = src + p.From.Offset = off + p.To.Type = obj.TYPE_REG + p.To.Reg = tmp + p = s.Prog(x86.AMOVUPS) + p.From.Type = obj.TYPE_REG + p.From.Reg = tmp + p.To.Type = obj.TYPE_MEM + p.To.Reg = dst + p.To.Offset = off +} diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index 1e0a599570..7d3efef5cd 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -264,24 +264,6 @@ (Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem) (Move [16] dst src mem) => (MOVOstore dst (MOVOload src mem) mem) -(Move [32] dst src mem) => - (Move [16] - (OffPtr dst [16]) - (OffPtr src [16]) - (Move [16] dst src mem)) - -(Move [48] dst src mem) => - (Move [32] - (OffPtr dst [16]) - (OffPtr src [16]) - (Move [16] dst src mem)) - -(Move [64] dst src mem) => - (Move [32] - (OffPtr dst [32]) - (OffPtr src [32]) - (Move [32] dst src mem)) - (Move [3] dst src mem) => (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem)) @@ -310,28 +292,19 @@ (MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem) (MOVQstore dst (MOVQload src mem) mem)) -// Adjust moves to be a multiple of 16 bytes. -(Move [s] dst src mem) - && s > 16 && s%16 != 0 && s%16 <= 8 => - (Move [s-s%16] - (OffPtr dst [s%16]) - (OffPtr src [s%16]) - (MOVQstore dst (MOVQload src mem) mem)) -(Move [s] dst src mem) - && s > 16 && s%16 != 0 && s%16 > 8 => - (Move [s-s%16] - (OffPtr dst [s%16]) - (OffPtr src [s%16]) - (MOVOstore dst (MOVOload src mem) mem)) - -// Medium copying uses a duff device. -(Move [s] dst src mem) - && s > 64 && s <= 16*64 && s%16 == 0 - && logLargeCopy(v, s) => - (DUFFCOPY [s] dst src mem) +// Copying up to 192 bytes uses straightline code. +(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem) + +// Copying up to ~1KB uses a small loop. +(Move [s] dst src mem) && s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem) // Large copying uses REP MOVSQ. -(Move [s] dst src mem) && s > 16*64 && s%8 == 0 && logLargeCopy(v, s) => +(Move [s] dst src mem) && s > repMoveThreshold && s%8 != 0 => + (Move [s-s%8] + (OffPtr dst [s%8]) + (OffPtr src [s%8]) + (MOVQstore dst (MOVQload src mem) mem)) +(Move [s] dst src mem) && s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) => (REPMOVSQ dst src (MOVQconst [s/8]) mem) // Lowering Zero instructions diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index b6c019f28a..e42b54398d 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -939,20 +939,38 @@ func init() { // arg0 = destination pointer // arg1 = source pointer // arg2 = mem - // auxint = # of bytes to copy, must be multiple of 16 + // auxint = # of bytes to copy // returns memory { - name: "DUFFCOPY", + name: "LoweredMove", aux: "Int64", argLength: 3, reg: regInfo{ - inputs: []regMask{buildReg("DI"), buildReg("SI")}, - clobbers: buildReg("DI SI X0"), // uses X0 as a temporary + inputs: []regMask{gp, gp}, + clobbers: buildReg("X14"), // uses X14 as a temporary }, - clobberFlags: true, - //faultOnNilArg0: true, // Note: removed for 73748. 
TODO: reenable at some point - //faultOnNilArg1: true, - unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts + faultOnNilArg0: true, + faultOnNilArg1: true, + }, + // arg0 = destination pointer + // arg1 = source pointer + // arg2 = mem + // auxint = # of bytes to copy + // returns memory + { + name: "LoweredMoveLoop", + aux: "Int64", + argLength: 3, + reg: regInfo{ + inputs: []regMask{gp, gp}, + clobbers: buildReg("X14"), // uses X14 as a temporary + clobbersArg0: true, + clobbersArg1: true, + }, + clobberFlags: true, + faultOnNilArg0: true, + faultOnNilArg1: true, + needIntTemp: true, }, // arg0 = destination pointer diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index e155eca5ff..a62a4b1cf6 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1058,7 +1058,8 @@ const ( OpAMD64CALLtail OpAMD64CALLclosure OpAMD64CALLinter - OpAMD64DUFFCOPY + OpAMD64LoweredMove + OpAMD64LoweredMoveLoop OpAMD64REPMOVSQ OpAMD64InvertFlags OpAMD64LoweredGetG @@ -13966,17 +13967,35 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "DUFFCOPY", - auxType: auxInt64, - argLen: 3, - clobberFlags: true, - unsafePoint: true, + name: "LoweredMove", + auxType: auxInt64, + argLen: 3, + faultOnNilArg0: true, + faultOnNilArg1: true, reg: regInfo{ inputs: []inputInfo{ - {0, 128}, // DI - {1, 64}, // SI + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 }, - clobbers: 65728, // SI DI X0 + clobbers: 1073741824, // X14 + }, + }, + { + name: "LoweredMoveLoop", + auxType: auxInt64, + argLen: 3, + clobberFlags: true, + needIntTemp: true, + faultOnNilArg0: true, + faultOnNilArg1: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + clobbers: 1073741824, // X14 + clobbersArg0: true, + clobbersArg1: true, }, }, { diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go index 45506d5b33..f3c1d3bd96 100644 --- a/src/cmd/compile/internal/ssa/regalloc.go +++ b/src/cmd/compile/internal/ssa/regalloc.go @@ -561,7 +561,14 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos pos = pos.WithNotStmt() // Check if v is already in a requested register. if mask&vi.regs != 0 { - r := pickReg(mask & vi.regs) + mask &= vi.regs + r := pickReg(mask) + if mask.contains(s.SPReg) { + // Prefer the stack pointer if it is allowed. + // (Needed because the op might have an Aux symbol + // that needs SP as its base.) 
+ r = s.SPReg + } if !s.allocatable.contains(r) { return v // v is in a fixed register } diff --git a/src/cmd/compile/internal/ssa/regalloc_test.go b/src/cmd/compile/internal/ssa/regalloc_test.go index e7ed416c50..0f69b852d1 100644 --- a/src/cmd/compile/internal/ssa/regalloc_test.go +++ b/src/cmd/compile/internal/ssa/regalloc_test.go @@ -240,6 +240,30 @@ func TestClobbersArg0(t *testing.T) { } } +func TestClobbersArg1(t *testing.T) { + c := testConfig(t) + f := c.Fun("entry", + Bloc("entry", + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("src", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())), + Valu("dst", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())), + Valu("use1", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())), + Valu("use2", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())), + Valu("move", OpAMD64LoweredMoveLoop, types.TypeMem, 256, nil, "dst", "src", "mem"), + Valu("store1", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use1", "src", "move"), + Valu("store2", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use2", "dst", "store1"), + Exit("store2"))) + flagalloc(f.f) + regalloc(f.f) + checkFunc(f.f) + // LoweredMoveLoop clobbers its arguments, so there must be a copy of "src" and "dst" somewhere + // so we still have that value available at the stores. + if n := numCopies(f.blocks["entry"]); n != 2 { + fmt.Printf("%s\n", f.f.String()) + t.Errorf("got %d copies, want 2", n) + } +} + func numSpills(b *Block) int { return numOps(b, OpStoreReg) } diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 2239927521..236a3f885a 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -31,6 +31,7 @@ const ( removeDeadValues = true repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing + repMoveThreshold = 1408 // size beyond which we use REP MOVS for copying ) // deadcode indicates whether rewrite should try to remove any values that become dead. 
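For readers skimming the rule changes above: the new Move lowering partitions copies by size. Below is an illustrative, self-contained Go sketch (not part of this CL; the helper name moveStrategy is made up) of how the rules in AMD64.rules choose a strategy. The 192-byte boundary and repMoveThreshold = 1408 are taken from the rules and from rewrite.go above; sizes of 16 bytes and under are covered by the existing per-size rules and are summarized here as a single case.

package main

import "fmt"

const repMoveThreshold = 1408 // size beyond which REP MOVS is used, per rewrite.go

// moveStrategy reports which lowering the new rules would pick for a
// Move of s bytes. Purely illustrative; the compiler does this via
// rewrite rules, not a switch.
func moveStrategy(s int64) string {
	switch {
	case s <= 16:
		return "fixed small-size load/store rules"
	case s < 192:
		return "LoweredMove (straightline 16-byte MOVUPS copies)"
	case s <= repMoveThreshold:
		return "LoweredMoveLoop (64-byte loop plus straightline tail)"
	case s%8 != 0:
		return "peel s%8 bytes with a MOVQ, then Move the remaining multiple of 8"
	default:
		return "REPMOVSQ"
	}
}

func main() {
	for _, s := range []int64{24, 100, 192, 1408, 1409, 4096} {
		fmt.Printf("%4d bytes: %s\n", s, moveStrategy(s))
	}
}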
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index c83890aee6..a7ee632ae1 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -27307,75 +27307,6 @@ func rewriteValueAMD64_OpMove(v *Value) bool { v.AddArg3(dst, v0, mem) return true } - // match: (Move [32] dst src mem) - // result: (Move [16] (OffPtr dst [16]) (OffPtr src [16]) (Move [16] dst src mem)) - for { - if auxIntToInt64(v.AuxInt) != 32 { - break - } - dst := v_0 - src := v_1 - mem := v_2 - v.reset(OpMove) - v.AuxInt = int64ToAuxInt(16) - v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) - v0.AuxInt = int64ToAuxInt(16) - v0.AddArg(dst) - v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) - v1.AuxInt = int64ToAuxInt(16) - v1.AddArg(src) - v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem) - v2.AuxInt = int64ToAuxInt(16) - v2.AddArg3(dst, src, mem) - v.AddArg3(v0, v1, v2) - return true - } - // match: (Move [48] dst src mem) - // result: (Move [32] (OffPtr dst [16]) (OffPtr src [16]) (Move [16] dst src mem)) - for { - if auxIntToInt64(v.AuxInt) != 48 { - break - } - dst := v_0 - src := v_1 - mem := v_2 - v.reset(OpMove) - v.AuxInt = int64ToAuxInt(32) - v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) - v0.AuxInt = int64ToAuxInt(16) - v0.AddArg(dst) - v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) - v1.AuxInt = int64ToAuxInt(16) - v1.AddArg(src) - v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem) - v2.AuxInt = int64ToAuxInt(16) - v2.AddArg3(dst, src, mem) - v.AddArg3(v0, v1, v2) - return true - } - // match: (Move [64] dst src mem) - // result: (Move [32] (OffPtr dst [32]) (OffPtr src [32]) (Move [32] dst src mem)) - for { - if auxIntToInt64(v.AuxInt) != 64 { - break - } - dst := v_0 - src := v_1 - mem := v_2 - v.reset(OpMove) - v.AuxInt = int64ToAuxInt(32) - v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) - v0.AuxInt = int64ToAuxInt(32) - v0.AddArg(dst) - v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) - v1.AuxInt = int64ToAuxInt(32) - v1.AddArg(src) - v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem) - v2.AuxInt = int64ToAuxInt(32) - v2.AddArg3(dst, src, mem) - v.AddArg3(v0, v1, v2) - return true - } // match: (Move [3] dst src mem) // result: (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem)) for { @@ -27568,82 +27499,72 @@ func rewriteValueAMD64_OpMove(v *Value) bool { return true } // match: (Move [s] dst src mem) - // cond: s > 16 && s%16 != 0 && s%16 <= 8 - // result: (Move [s-s%16] (OffPtr dst [s%16]) (OffPtr src [s%16]) (MOVQstore dst (MOVQload src mem) mem)) + // cond: s > 16 && s < 192 && logLargeCopy(v, s) + // result: (LoweredMove [s] dst src mem) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s > 16 && s%16 != 0 && s%16 <= 8) { + if !(s > 16 && s < 192 && logLargeCopy(v, s)) { break } - v.reset(OpMove) - v.AuxInt = int64ToAuxInt(s - s%16) - v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) - v0.AuxInt = int64ToAuxInt(s % 16) - v0.AddArg(dst) - v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) - v1.AuxInt = int64ToAuxInt(s % 16) - v1.AddArg(src) - v2 := b.NewValue0(v.Pos, OpAMD64MOVQstore, types.TypeMem) - v3 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64) - v3.AddArg2(src, mem) - v2.AddArg3(dst, v3, mem) - v.AddArg3(v0, v1, v2) + v.reset(OpAMD64LoweredMove) + v.AuxInt = int64ToAuxInt(s) + v.AddArg3(dst, src, mem) return true } // match: (Move [s] dst src mem) - // cond: s > 16 && s%16 != 0 && s%16 > 8 - // result: (Move [s-s%16] (OffPtr dst [s%16]) (OffPtr src [s%16]) (MOVOstore dst 
(MOVOload src mem) mem)) + // cond: s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) + // result: (LoweredMoveLoop [s] dst src mem) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s > 16 && s%16 != 0 && s%16 > 8) { + if !(s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s)) { break } - v.reset(OpMove) - v.AuxInt = int64ToAuxInt(s - s%16) - v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) - v0.AuxInt = int64ToAuxInt(s % 16) - v0.AddArg(dst) - v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) - v1.AuxInt = int64ToAuxInt(s % 16) - v1.AddArg(src) - v2 := b.NewValue0(v.Pos, OpAMD64MOVOstore, types.TypeMem) - v3 := b.NewValue0(v.Pos, OpAMD64MOVOload, types.TypeInt128) - v3.AddArg2(src, mem) - v2.AddArg3(dst, v3, mem) - v.AddArg3(v0, v1, v2) + v.reset(OpAMD64LoweredMoveLoop) + v.AuxInt = int64ToAuxInt(s) + v.AddArg3(dst, src, mem) return true } // match: (Move [s] dst src mem) - // cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s) - // result: (DUFFCOPY [s] dst src mem) + // cond: s > repMoveThreshold && s%8 != 0 + // result: (Move [s-s%8] (OffPtr dst [s%8]) (OffPtr src [s%8]) (MOVQstore dst (MOVQload src mem) mem)) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) { + if !(s > repMoveThreshold && s%8 != 0) { break } - v.reset(OpAMD64DUFFCOPY) - v.AuxInt = int64ToAuxInt(s) - v.AddArg3(dst, src, mem) + v.reset(OpMove) + v.AuxInt = int64ToAuxInt(s - s%8) + v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) + v0.AuxInt = int64ToAuxInt(s % 8) + v0.AddArg(dst) + v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) + v1.AuxInt = int64ToAuxInt(s % 8) + v1.AddArg(src) + v2 := b.NewValue0(v.Pos, OpAMD64MOVQstore, types.TypeMem) + v3 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64) + v3.AddArg2(src, mem) + v2.AddArg3(dst, v3, mem) + v.AddArg3(v0, v1, v2) return true } // match: (Move [s] dst src mem) - // cond: s > 16*64 && s%8 == 0 && logLargeCopy(v, s) + // cond: s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) // result: (REPMOVSQ dst src (MOVQconst [s/8]) mem) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s > 16*64 && s%8 == 0 && logLargeCopy(v, s)) { + if !(s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s)) { break } v.reset(OpAMD64REPMOVSQ) -- 2.51.0
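The tail handling in the new LoweredMove/LoweredMoveLoop codegen (the move16(off + n - 16) call) deserves a note: any copy size that is not a multiple of 16 is finished with one extra 16-byte copy that partially overlaps the chunk before it. Here is a small, self-contained Go sketch of that schedule (not part of this CL); copy() stands in for the MOVUPS load/store pair, and re-copying a few already-written bytes is harmless because dst and src do not alias.

package main

import (
	"bytes"
	"fmt"
)

// moveLikeLoweredMove mirrors the copy schedule generated for
// LoweredMove: copy 16 bytes at a time, then cover any leftover tail
// with one extra, partially overlapping 16-byte copy.
func moveLikeLoweredMove(dst, src []byte, n int64) {
	if n < 16 {
		panic("Move too small") // the lowering requires n >= 16
	}
	move16 := func(off int64) {
		copy(dst[off:off+16], src[off:off+16])
	}
	var off int64
	for n >= 16 {
		move16(off)
		off += 16
		n -= 16
	}
	if n != 0 {
		// Overlaps the previous chunk by 16-n bytes.
		move16(off + n - 16)
	}
}

func main() {
	src := make([]byte, 50)
	for i := range src {
		src[i] = byte(i)
	}
	dst := make([]byte, 50)
	moveLikeLoweredMove(dst, src, 50) // 3 full chunks + 1 overlapping tail chunk
	fmt.Println("copied correctly:", bytes.Equal(dst, src))
}

LoweredMoveLoop uses the same tail code after its 64-byte loop body, so both ops avoid any byte/word-sized cleanup instructions.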