From: Keith Randall
Date: Thu, 5 Jun 2025 04:49:08 +0000 (-0700)
Subject: cmd/compile: use generated loops instead of DUFFCOPY on arm64
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=15d6dbc05cd8d9b71842a9e95730fd9a285f2580;p=gostls13.git

cmd/compile: use generated loops instead of DUFFCOPY on arm64

Change-Id: Ic2aa8959b7fc594b86def70b6c2be38badf7970c
Reviewed-on: https://go-review.googlesource.com/c/go/+/679015
Reviewed-by: Keith Randall
Reviewed-by: David Chase
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Jorropo
---

diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index cd0c2cdfaa..83293db9b9 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -1162,41 +1162,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
         // BNE  loop
         // There's a past-the-end pointer here, any problem with that?
-    case ssa.OpARM64DUFFCOPY:
-        p := s.Prog(obj.ADUFFCOPY)
-        p.To.Type = obj.TYPE_MEM
-        p.To.Name = obj.NAME_EXTERN
-        p.To.Sym = ir.Syms.Duffcopy
-        p.To.Offset = v.AuxInt
     case ssa.OpARM64LoweredMove:
-        // LDP.P  16(R16), (R25, Rtmp)
-        // STP.P  (R25, Rtmp), 16(R17)
-        // CMP    Rarg2, R16
-        // BLE    -3(PC)
-        // arg2 is the address of the last element of src
-        p := s.Prog(arm64.ALDP)
-        p.Scond = arm64.C_XPOST
-        p.From.Type = obj.TYPE_MEM
-        p.From.Reg = arm64.REG_R16
-        p.From.Offset = 16
-        p.To.Type = obj.TYPE_REGREG
-        p.To.Reg = arm64.REG_R25
-        p.To.Offset = int64(arm64.REGTMP)
-        p2 := s.Prog(arm64.ASTP)
-        p2.Scond = arm64.C_XPOST
-        p2.From.Type = obj.TYPE_REGREG
-        p2.From.Reg = arm64.REG_R25
-        p2.From.Offset = int64(arm64.REGTMP)
-        p2.To.Type = obj.TYPE_MEM
-        p2.To.Reg = arm64.REG_R17
-        p2.To.Offset = 16
-        p3 := s.Prog(arm64.ACMP)
-        p3.From.Type = obj.TYPE_REG
-        p3.From.Reg = v.Args[2].Reg()
-        p3.Reg = arm64.REG_R16
-        p4 := s.Prog(arm64.ABLE)
-        p4.To.Type = obj.TYPE_BRANCH
-        p4.To.SetTarget(p)
+        dstReg := v.Args[0].Reg()
+        srcReg := v.Args[1].Reg()
+        if dstReg == srcReg {
+            break
+        }
+        tmpReg1 := int16(arm64.REG_R24)
+        tmpReg2 := int16(arm64.REG_R25)
+        n := v.AuxInt
+        if n < 16 {
+            v.Fatalf("Move too small %d", n)
+        }
+
+        // Generate copying instructions.
+        var off int64
+        for n >= 16 {
+            // LDP  off(srcReg), (tmpReg1, tmpReg2)
+            // STP  (tmpReg1, tmpReg2), off(dstReg)
+            move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+            off += 16
+            n -= 16
+        }
+        if n > 8 {
+            // MOVD off(srcReg), tmpReg1
+            // MOVD tmpReg1, off(dstReg)
+            move8(s, srcReg, dstReg, tmpReg1, off)
+            off += 8
+            n -= 8
+        }
+        if n != 0 {
+            // MOVD off+n-8(srcReg), tmpReg1
+            // MOVD tmpReg1, off+n-8(dstReg)
+            move8(s, srcReg, dstReg, tmpReg1, off+n-8)
+        }
+    case ssa.OpARM64LoweredMoveLoop:
+        dstReg := v.Args[0].Reg()
+        srcReg := v.Args[1].Reg()
+        if dstReg == srcReg {
+            break
+        }
+        countReg := int16(arm64.REG_R23)
+        tmpReg1 := int16(arm64.REG_R24)
+        tmpReg2 := int16(arm64.REG_R25)
+        n := v.AuxInt
+        loopSize := int64(64)
+        if n < 3*loopSize {
+            // - a loop count of 0 won't work.
+            // - a loop count of 1 is useless.
+            // - a loop count of 2 is a code size ~tie
+            //     3 instructions to implement the loop
+            //     4 instructions in the loop body
+            //   vs
+            //     8 instructions in the straightline code
+            //   Might as well use straightline code.
+            v.Fatalf("ZeroLoop size too small %d", n)
+        }
+
+        // Put iteration count in a register.
+        //   MOVD $n, countReg
+        p := s.Prog(arm64.AMOVD)
+        p.From.Type = obj.TYPE_CONST
+        p.From.Offset = n / loopSize
+        p.To.Type = obj.TYPE_REG
+        p.To.Reg = countReg
+        cntInit := p
+
+        // Move loopSize bytes starting at srcReg to dstReg.
+        // Increment srcReg and destReg by loopSize as a side effect.
+        for range loopSize / 16 {
+            // LDP.P  16(srcReg), (tmpReg1, tmpReg2)
+            // STP.P  (tmpReg1, tmpReg2), 16(dstReg)
+            move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
+        }
+        // Decrement loop count.
+        //   SUB $1, countReg
+        p = s.Prog(arm64.ASUB)
+        p.From.Type = obj.TYPE_CONST
+        p.From.Offset = 1
+        p.To.Type = obj.TYPE_REG
+        p.To.Reg = countReg
+        // Jump to loop header if we're not done yet.
+        //   CBNZ head
+        p = s.Prog(arm64.ACBNZ)
+        p.From.Type = obj.TYPE_REG
+        p.From.Reg = countReg
+        p.To.Type = obj.TYPE_BRANCH
+        p.To.SetTarget(cntInit.Link)
+
+        // Multiples of the loop size are now done.
+        n %= loopSize
+
+        // Copy any fractional portion.
+        var off int64
+        for n >= 16 {
+            // LDP  off(srcReg), (tmpReg1, tmpReg2)
+            // STP  (tmpReg1, tmpReg2), off(dstReg)
+            move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+            off += 16
+            n -= 16
+        }
+        if n > 8 {
+            // MOVD off(srcReg), tmpReg1
+            // MOVD tmpReg1, off(dstReg)
+            move8(s, srcReg, dstReg, tmpReg1, off)
+            off += 8
+            n -= 8
+        }
+        if n != 0 {
+            // MOVD off+n-8(srcReg), tmpReg1
+            // MOVD tmpReg1, off+n-8(dstReg)
+            move8(s, srcReg, dstReg, tmpReg1, off+n-8)
+        }
+
     case ssa.OpARM64CALLstatic, ssa.OpARM64CALLclosure, ssa.OpARM64CALLinter:
         s.Call(v)
     case ssa.OpARM64CALLtail:
@@ -1599,3 +1677,53 @@ func zero8(s *ssagen.State, reg int16, off int64) {
     p.To.Reg = reg
     p.To.Offset = off
 }
+
+// move16 copies 16 bytes at src+off to dst+off.
+// Uses registers tmp1 and tmp2.
+// If postInc is true, increment src and dst by 16.
+func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
+    // LDP  off(src), (tmp1, tmp2)
+    ld := s.Prog(arm64.ALDP)
+    ld.From.Type = obj.TYPE_MEM
+    ld.From.Reg = src
+    ld.From.Offset = off
+    ld.To.Type = obj.TYPE_REGREG
+    ld.To.Reg = tmp1
+    ld.To.Offset = int64(tmp2)
+    // STP  (tmp1, tmp2), off(dst)
+    st := s.Prog(arm64.ASTP)
+    st.From.Type = obj.TYPE_REGREG
+    st.From.Reg = tmp1
+    st.From.Offset = int64(tmp2)
+    st.To.Type = obj.TYPE_MEM
+    st.To.Reg = dst
+    st.To.Offset = off
+    if postInc {
+        if off != 0 {
+            panic("can't postinc with non-zero offset")
+        }
+        ld.Scond = arm64.C_XPOST
+        st.Scond = arm64.C_XPOST
+        ld.From.Offset = 16
+        st.To.Offset = 16
+    }
+}
+
+// move8 copies 8 bytes at src+off to dst+off.
+// Uses register tmp.
+func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
+    // MOVD off(src), tmp
+    ld := s.Prog(arm64.AMOVD)
+    ld.From.Type = obj.TYPE_MEM
+    ld.From.Reg = src
+    ld.From.Offset = off
+    ld.To.Type = obj.TYPE_REG
+    ld.To.Reg = tmp
+    // MOVD tmp, off(dst)
+    st := s.Prog(arm64.AMOVD)
+    st.From.Type = obj.TYPE_REG
+    st.From.Reg = tmp
+    st.To.Type = obj.TYPE_MEM
+    st.To.Reg = dst
+    st.To.Offset = off
+}
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64.rules b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
index 197db974b2..f54a692725 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64.rules
@@ -462,39 +462,8 @@ (STP [16] dst (Select0 (LDP [16] src mem)) (Select1 (LDP [16] src mem))
         (STP dst (Select0 (LDP src mem)) (Select1 (LDP src mem)) mem))))
 
-// strip off fractional word move
-(Move [s] dst src mem) && s%16 != 0 && s%16 <= 8 && s > 64 =>
-    (Move [8]
-        (OffPtr dst [s-8])
-        (OffPtr src [s-8])
-        (Move [s-s%16] dst src mem))
-(Move [s] dst src mem) && s%16 != 0 && s%16 > 8 && s > 64 =>
-    (Move [16]
-        (OffPtr dst [s-16])
-        (OffPtr src [s-16])
-        (Move [s-s%16] dst src mem))
-
-// medium move uses a duff device
-(Move [s] dst src mem)
-    && s > 64 && s <= 16*64 && s%16 == 0
-    && logLargeCopy(v, s) =>
-    (DUFFCOPY [8 * (64 - s/16)] dst src mem)
-// 8 is the number of bytes to encode:
-//
-//  LDP.P  16(R16), (R26, R27)
-//  STP.P  (R26, R27), 16(R17)
-//
-// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy
-
-// large move uses a loop
-(Move [s] dst src mem)
-    && s%16 == 0 && s > 16*64
-    && logLargeCopy(v, s) =>
-    (LoweredMove
-        dst
-        src
-        (ADDconst src [s-16])
-        mem)
+(Move [s] dst src mem) && s > 64 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
 
 // calls
 (StaticCall ...) => (CALLstatic ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
index 072cc2f4c8..51aa37886b 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
@@ -144,6 +144,8 @@ func init() {
         gpspsbg    = gpspg | buildReg("SB")
         fp         = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
         callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+        r24to25    = buildReg("R24 R25")
+        r23to25    = buildReg("R23 R24 R25")
         rz         = buildReg("ZERO")
         first16    = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
     )
@@ -568,47 +570,40 @@ func init() {
             needIntTemp: true,
         },
 
-        // duffcopy
-        // arg0 = address of dst memory (in R21, changed as side effect)
-        // arg1 = address of src memory (in R20, changed as side effect)
+        // medium copying
+        // arg0 = address of dst memory
+        // arg1 = address of src memory
         // arg2 = mem
-        // auxint = offset into duffcopy code to start executing
+        // auxint = # of bytes to copy
         // returns mem
-        // R20, R21 changed as side effect
-        // R16 and R17 may be clobbered by linker trampoline.
         {
-            name:      "DUFFCOPY",
+            name:      "LoweredMove",
             aux:       "Int64",
             argLength: 3,
             reg: regInfo{
-                inputs:   []regMask{buildReg("R21"), buildReg("R20")},
-                clobbers: buildReg("R16 R17 R20 R21 R26 R30"),
+                inputs:   []regMask{gp &^ r24to25, gp &^ r24to25},
+                clobbers: r24to25, // TODO: figure out needIntTemp x2
             },
-            //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
-            //faultOnNilArg1: true,
-            unsafePoint:    true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+            faultOnNilArg0: true,
+            faultOnNilArg1: true,
         },
 
-        // large move
-        // arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect)
-        // arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
-        // arg2 = address of the last element of src
-        // arg3 = mem
+        // large copying
+        // arg0 = address of dst memory
+        // arg1 = address of src memory
+        // arg2 = mem
+        // auxint = # of bytes to copy
         // returns mem
-        //  LDP.P  16(R16), (R25, Rtmp)
-        //  STP.P  (R25, Rtmp), 16(R17)
-        //  CMP    Rarg2, R16
-        //  BLE    -3(PC)
-        // Note: the-end-of-src may be not a valid pointer. it's a problem if it is spilled.
-        // the-end-of-src - 16 is within the area to copy, ok to spill.
         {
-            name:      "LoweredMove",
-            argLength: 4,
+            name:      "LoweredMoveLoop",
+            aux:       "Int64",
+            argLength: 3,
             reg: regInfo{
-                inputs:   []regMask{buildReg("R17"), buildReg("R16"), gp &^ buildReg("R25")},
-                clobbers: buildReg("R16 R17 R25"),
+                inputs:       []regMask{gp &^ r23to25, gp &^ r23to25},
+                clobbers:     r23to25, // TODO: figure out needIntTemp x3
+                clobbersArg0: true,
+                clobbersArg1: true,
             },
-            clobberFlags:   true,
             faultOnNilArg0: true,
             faultOnNilArg1: true,
         },
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 215f0ae43a..3536cfcd75 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1720,8 +1720,8 @@ const (
     OpARM64GreaterEqualNoov
     OpARM64LoweredZero
     OpARM64LoweredZeroLoop
-    OpARM64DUFFCOPY
     OpARM64LoweredMove
+    OpARM64LoweredMoveLoop
     OpARM64LoweredGetClosurePtr
     OpARM64LoweredGetCallerSP
     OpARM64LoweredGetCallerPC
@@ -23096,31 +23096,33 @@ var opcodeTable = [...]opInfo{
         },
     },
     {
-        name:        "DUFFCOPY",
-        auxType:     auxInt64,
-        argLen:      3,
-        unsafePoint: true,
+        name:           "LoweredMove",
+        auxType:        auxInt64,
+        argLen:         3,
+        faultOnNilArg0: true,
+        faultOnNilArg1: true,
         reg: regInfo{
             inputs: []inputInfo{
-                {0, 1048576}, // R21
-                {1, 524288},  // R20
+                {0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
+                {1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
             },
-            clobbers: 303759360, // R16 R17 R20 R21 R26 R30
+            clobbers: 25165824, // R24 R25
         },
     },
     {
-        name:         "LoweredMove",
-        argLen:       4,
-        clobberFlags: true,
+        name:           "LoweredMoveLoop",
+        auxType:        auxInt64,
+        argLen:         3,
         faultOnNilArg0: true,
         faultOnNilArg1: true,
         reg: regInfo{
             inputs: []inputInfo{
-                {0, 131072}, // R17
-                {1, 65536},  // R16
-                {2, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
+                {0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
+                {1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
             },
-            clobbers: 16973824, // R16 R17 R25
+            clobbers:     29360128, // R23 R24 R25
+            clobbersArg0: true,
+            clobbersArg1: true,
         },
     },
     {
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 59d6fe64db..6af1558833 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -19688,87 +19688,35 @@ func rewriteValueARM64_OpMove(v *Value) bool {
         return true
     }
     // match: (Move [s] dst src mem)
-    // cond: s%16 != 0 && s%16 <= 8 && s > 64
-    // result: (Move [8] (OffPtr dst [s-8]) (OffPtr src [s-8]) (Move [s-s%16] dst src mem))
+    // cond: s > 64 && s < 192 && logLargeCopy(v, s)
+    // result: (LoweredMove [s] dst src mem)
     for {
         s := auxIntToInt64(v.AuxInt)
         dst := v_0
         src := v_1
         mem := v_2
-        if !(s%16 != 0 && s%16 <= 8 && s > 64) {
+        if !(s > 64 && s < 192 && logLargeCopy(v, s)) {
             break
         }
-        v.reset(OpMove)
-        v.AuxInt = int64ToAuxInt(8)
-        v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
-        v0.AuxInt = int64ToAuxInt(s - 8)
-        v0.AddArg(dst)
-        v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
-        v1.AuxInt = int64ToAuxInt(s - 8)
-        v1.AddArg(src)
-        v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
-        v2.AuxInt = int64ToAuxInt(s - s%16)
-        v2.AddArg3(dst, src, mem)
-        v.AddArg3(v0, v1, v2)
-        return true
-    }
-    // match: (Move [s] dst src mem)
-    // cond: s%16 != 0 && s%16 > 8 && s > 64
-    // result: (Move [16] (OffPtr dst [s-16]) (OffPtr src [s-16]) (Move [s-s%16] dst src mem))
-    for {
-        s := auxIntToInt64(v.AuxInt)
-        dst := v_0
-        src := v_1
-        mem := v_2
-        if !(s%16 != 0 && s%16 > 8 && s > 64) {
-            break
-        }
-        v.reset(OpMove)
-        v.AuxInt = int64ToAuxInt(16)
-        v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
-        v0.AuxInt = int64ToAuxInt(s - 16)
-        v0.AddArg(dst)
-        v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
-        v1.AuxInt = int64ToAuxInt(s - 16)
-        v1.AddArg(src)
-        v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
-        v2.AuxInt = int64ToAuxInt(s - s%16)
-        v2.AddArg3(dst, src, mem)
-        v.AddArg3(v0, v1, v2)
-        return true
-    }
-    // match: (Move [s] dst src mem)
-    // cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)
-    // result: (DUFFCOPY [8 * (64 - s/16)] dst src mem)
-    for {
-        s := auxIntToInt64(v.AuxInt)
-        dst := v_0
-        src := v_1
-        mem := v_2
-        if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) {
-            break
-        }
-        v.reset(OpARM64DUFFCOPY)
-        v.AuxInt = int64ToAuxInt(8 * (64 - s/16))
+        v.reset(OpARM64LoweredMove)
+        v.AuxInt = int64ToAuxInt(s)
         v.AddArg3(dst, src, mem)
         return true
     }
     // match: (Move [s] dst src mem)
-    // cond: s%16 == 0 && s > 16*64 && logLargeCopy(v, s)
-    // result: (LoweredMove dst src (ADDconst src [s-16]) mem)
+    // cond: s >= 192 && logLargeCopy(v, s)
+    // result: (LoweredMoveLoop [s] dst src mem)
     for {
         s := auxIntToInt64(v.AuxInt)
         dst := v_0
         src := v_1
         mem := v_2
-        if !(s%16 == 0 && s > 16*64 && logLargeCopy(v, s)) {
+        if !(s >= 192 && logLargeCopy(v, s)) {
             break
        }
-        v.reset(OpARM64LoweredMove)
-        v0 := b.NewValue0(v.Pos, OpARM64ADDconst, src.Type)
-        v0.AuxInt = int64ToAuxInt(s - 16)
-        v0.AddArg(src)
-        v.AddArg4(dst, src, v0, mem)
+        v.reset(OpARM64LoweredMoveLoop)
+        v.AuxInt = int64ToAuxInt(s)
+        v.AddArg3(dst, src, mem)
        return true
    }
    return false
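
For reference, a rough sketch (not from the CL itself) of how the two new paths apply, assuming the thresholds in ARM64.rules above (64 < s < 192 straightline, s >= 192 loop) and the R23-R25 temporaries fixed in ssa.go; the commented instruction sequence is approximate, and srcReg/dstReg stand for whatever registers the allocator picks.

package example

// copy128 is small enough (64 < 128 < 192) to take the straightline
// LoweredMove path: eight LDP/STP pairs through R24/R25, no loop.
func copy128(dst, src *[128]byte) {
	*dst = *src
}

// copy256 is large enough (256 >= 192) to take the LoweredMoveLoop path.
// Roughly, the lowering above emits:
//
//	MOVD   $4, R23                  // 256/64 loop iterations
//	loop:
//	LDP.P  16(srcReg), (R24, R25)   // repeated 4 times per iteration (64 bytes)
//	STP.P  (R24, R25), 16(dstReg)
//	SUB    $1, R23
//	CBNZ   R23, loop
func copy256(dst, src *[256]byte) {
	*dst = *src
}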