// BNE loop
- case ssa.OpARM64DUFFCOPY:
- p := s.Prog(obj.ADUFFCOPY)
- p.To.Type = obj.TYPE_MEM
- p.To.Name = obj.NAME_EXTERN
- p.To.Sym = ir.Syms.Duffcopy
- p.To.Offset = v.AuxInt
case ssa.OpARM64LoweredMove:
- // LDP.P 16(R16), (R25, Rtmp)
- // STP.P (R25, Rtmp), 16(R17)
- // CMP Rarg2, R16
- // BLE -3(PC)
- // arg2 is the address of the last element of src
- p := s.Prog(arm64.ALDP)
- p.Scond = arm64.C_XPOST
- p.From.Type = obj.TYPE_MEM
- p.From.Reg = arm64.REG_R16
- p.From.Offset = 16
- p.To.Type = obj.TYPE_REGREG
- p.To.Reg = arm64.REG_R25
- p.To.Offset = int64(arm64.REGTMP)
- p2 := s.Prog(arm64.ASTP)
- p2.Scond = arm64.C_XPOST
- p2.From.Type = obj.TYPE_REGREG
- p2.From.Reg = arm64.REG_R25
- p2.From.Offset = int64(arm64.REGTMP)
- p2.To.Type = obj.TYPE_MEM
- p2.To.Reg = arm64.REG_R17
- p2.To.Offset = 16
- p3 := s.Prog(arm64.ACMP)
- p3.From.Type = obj.TYPE_REG
- p3.From.Reg = v.Args[2].Reg()
- p3.Reg = arm64.REG_R16
- p4 := s.Prog(arm64.ABLE)
- p4.To.Type = obj.TYPE_BRANCH
- p4.To.SetTarget(p)
+ dstReg := v.Args[0].Reg()
+ srcReg := v.Args[1].Reg()
+ if dstReg == srcReg {
+ break
+ }
+ tmpReg1 := int16(arm64.REG_R24)
+ tmpReg2 := int16(arm64.REG_R25)
+ n := v.AuxInt
+ if n < 16 {
+ v.Fatalf("Move too small %d", n)
+ }
+
+ // Generate copying instructions.
+ var off int64
+ for n >= 16 {
+ // LDP off(srcReg), (tmpReg1, tmpReg2)
+ // STP (tmpReg1, tmpReg2), off(dstReg)
+ move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+ off += 16
+ n -= 16
+ }
+ if n > 8 {
+ // MOVD off(srcReg), tmpReg1
+ // MOVD tmpReg1, off(dstReg)
+ move8(s, srcReg, dstReg, tmpReg1, off)
+ off += 8
+ n -= 8
+ }
+ if n != 0 {
+ // MOVD off+n-8(srcReg), tmpReg1
+ // MOVD tmpReg1, off+n-8(dstReg)
+ move8(s, srcReg, dstReg, tmpReg1, off+n-8)
+ }
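+ // For example (illustrative; the actual src/dst registers are
+ // whatever the register allocator picked, here R1 and R0),
+ // a 76-byte move expands to:
+ // LDP (R1), (R24, R25)
+ // STP (R24, R25), (R0)
+ // LDP 16(R1), (R24, R25)
+ // STP (R24, R25), 16(R0)
+ // LDP 32(R1), (R24, R25)
+ // STP (R24, R25), 32(R0)
+ // LDP 48(R1), (R24, R25)
+ // STP (R24, R25), 48(R0)
+ // MOVD 64(R1), R24
+ // MOVD R24, 64(R0)
+ // MOVD 68(R1), R24 // tail overlaps the move above by 4 bytes
+ // MOVD R24, 68(R0)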
+ case ssa.OpARM64LoweredMoveLoop:
+ dstReg := v.Args[0].Reg()
+ srcReg := v.Args[1].Reg()
+ if dstReg == srcReg {
+ break
+ }
+ countReg := int16(arm64.REG_R23)
+ tmpReg1 := int16(arm64.REG_R24)
+ tmpReg2 := int16(arm64.REG_R25)
+ n := v.AuxInt
+ loopSize := int64(64)
+ if n < 3*loopSize {
+ // - a loop count of 0 won't work.
+ // - a loop count of 1 is useless.
+ // - a loop count of 2 barely pays for the loop overhead:
+ // 3 instructions to implement the loop
+ // 8 instructions in the loop body (4 LDP/STP pairs)
+ // vs
+ // 16 instructions in the straightline code.
+ // Might as well use straightline code.
+ v.Fatalf("ZeroLoop size too small %d", n)
+ }
+
+ // Put iteration count in a register.
+ // MOVD $(n/loopSize), countReg
+ p := s.Prog(arm64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = n / loopSize
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
+ cntInit := p
+
+ // Move loopSize bytes starting at srcReg to dstReg.
+ // Increment srcReg and dstReg by loopSize as a side effect.
+ for range loopSize / 16 {
+ // LDP.P 16(srcReg), (tmpReg1, tmpReg2)
+ // STP.P (tmpReg1, tmpReg2), 16(dstReg)
+ move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
+ }
+ // Decrement loop count.
+ // SUB $1, countReg
+ p = s.Prog(arm64.ASUB)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 1
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
+ // Jump to loop header if we're not done yet.
+ // CBNZ countReg, head
+ p = s.Prog(arm64.ACBNZ)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = countReg
+ p.To.Type = obj.TYPE_BRANCH
+ p.To.SetTarget(cntInit.Link)
+
+ // Multiples of the loop size are now done.
+ n %= loopSize
+
+ // Copy any fractional portion.
+ var off int64
+ for n >= 16 {
+ // LDP off(srcReg), (tmpReg1, tmpReg2)
+ // STP (tmpReg1, tmpReg2), off(dstReg)
+ move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+ off += 16
+ n -= 16
+ }
+ if n > 8 {
+ // MOVD off(srcReg), tmpReg1
+ // MOVD tmpReg1, off(dstReg)
+ move8(s, srcReg, dstReg, tmpReg1, off)
+ off += 8
+ n -= 8
+ }
+ if n != 0 {
+ // MOVD off+n-8(srcReg), tmpReg1
+ // MOVD tmpReg1, off+n-8(dstReg)
+ move8(s, srcReg, dstReg, tmpReg1, off+n-8)
+ }
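+ // For example (illustrative, with src in R1 and dst in R0),
+ // a 200-byte move expands to:
+ // MOVD $3, R23
+ // loop:
+ // LDP.P 16(R1), (R24, R25)
+ // STP.P (R24, R25), 16(R0)
+ // ... three more LDP.P/STP.P pairs ...
+ // SUB $1, R23
+ // CBNZ R23, loop
+ // MOVD (R1), R24 // 8-byte tail; R0 and R1 now point 192 bytes in
+ // MOVD R24, (R0)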
+
case ssa.OpARM64CALLstatic, ssa.OpARM64CALLclosure, ssa.OpARM64CALLinter:
s.Call(v)
case ssa.OpARM64CALLtail:
p.To.Reg = reg
p.To.Offset = off
}
+
+// move16 copies 16 bytes at src+off to dst+off.
+// Uses registers tmp1 and tmp2.
+// If postInc is true, increment src and dst by 16.
+func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
+ // LDP off(src), (tmp1, tmp2)
+ ld := s.Prog(arm64.ALDP)
+ ld.From.Type = obj.TYPE_MEM
+ ld.From.Reg = src
+ ld.From.Offset = off
+ ld.To.Type = obj.TYPE_REGREG
+ ld.To.Reg = tmp1
+ ld.To.Offset = int64(tmp2)
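+ // For a TYPE_REGREG operand, the second register is carried in the
+ // Offset field (same for the STP below).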
+ // STP (tmp1, tmp2), off(dst)
+ st := s.Prog(arm64.ASTP)
+ st.From.Type = obj.TYPE_REGREG
+ st.From.Reg = tmp1
+ st.From.Offset = int64(tmp2)
+ st.To.Type = obj.TYPE_MEM
+ st.To.Reg = dst
+ st.To.Offset = off
+ if postInc {
+ if off != 0 {
+ panic("can't postinc with non-zero offset")
+ }
+ ld.Scond = arm64.C_XPOST
+ st.Scond = arm64.C_XPOST
+ ld.From.Offset = 16
+ st.To.Offset = 16
+ }
+}
+
+// move8 copies 8 bytes at src+off to dst+off.
+// Uses register tmp.
+func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
+ // MOVD off(src), tmp
+ ld := s.Prog(arm64.AMOVD)
+ ld.From.Type = obj.TYPE_MEM
+ ld.From.Reg = src
+ ld.From.Offset = off
+ ld.To.Type = obj.TYPE_REG
+ ld.To.Reg = tmp
+ // MOVD tmp, off(dst)
+ st := s.Prog(arm64.AMOVD)
+ st.From.Type = obj.TYPE_REG
+ st.From.Reg = tmp
+ st.To.Type = obj.TYPE_MEM
+ st.To.Reg = dst
+ st.To.Offset = off
+}
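+
+// Note that the tail handling in the cases above relies on overlapping
+// moves being safe. With, say, 12 bytes left at offset 64 they emit
+// move8(s, srcReg, dstReg, tmpReg1, 64) // bytes 64..71
+// move8(s, srcReg, dstReg, tmpReg1, 68) // bytes 68..75
+// Re-copying bytes 68..71 is harmless: a Move's source and destination
+// are assumed not to partially overlap, and identical pointers are
+// rejected up front (dstReg == srcReg).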
(STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
-// strip off fractional word move
-(Move [s] dst src mem) && s%16 != 0 && s%16 <= 8 && s > 64 =>
- (Move [8]
- (OffPtr <dst.Type> dst [s-8])
- (OffPtr <src.Type> src [s-8])
- (Move [s-s%16] dst src mem))
-(Move [s] dst src mem) && s%16 != 0 && s%16 > 8 && s > 64 =>
- (Move [16]
- (OffPtr <dst.Type> dst [s-16])
- (OffPtr <src.Type> src [s-16])
- (Move [s-s%16] dst src mem))
-
-// medium move uses a duff device
-(Move [s] dst src mem)
- && s > 64 && s <= 16*64 && s%16 == 0
- && logLargeCopy(v, s) =>
- (DUFFCOPY [8 * (64 - s/16)] dst src mem)
-// 8 is the number of bytes to encode:
-//
-// LDP.P 16(R16), (R26, R27)
-// STP.P (R26, R27), 16(R17)
-//
-// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy
-
-// large move uses a loop
-(Move [s] dst src mem)
- && s%16 == 0 && s > 16*64
- && logLargeCopy(v, s) =>
- (LoweredMove
- dst
- src
- (ADDconst <src.Type> src [s-16])
- mem)
+(Move [s] dst src mem) && s > 64 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
// calls
(StaticCall ...) => (CALLstatic ...)
gpspsbg = gpspg | buildReg("SB")
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+ r24to25 = buildReg("R24 R25")
+ r23to25 = buildReg("R23 R24 R25")
rz = buildReg("ZERO")
first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
)
needIntTemp: true,
},
- // duffcopy
- // arg0 = address of dst memory (in R21, changed as side effect)
- // arg1 = address of src memory (in R20, changed as side effect)
+ // medium copying
+ // arg0 = address of dst memory
+ // arg1 = address of src memory
// arg2 = mem
- // auxint = offset into duffcopy code to start executing
+ // auxint = # of bytes to copy
// returns mem
- // R20, R21 changed as side effect
- // R16 and R17 may be clobbered by linker trampoline.
{
- name: "DUFFCOPY",
+ name: "LoweredMove",
aux: "Int64",
argLength: 3,
reg: regInfo{
- inputs: []regMask{buildReg("R21"), buildReg("R20")},
- clobbers: buildReg("R16 R17 R20 R21 R26 R30"),
+ inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
+ clobbers: r24to25, // TODO: figure out needIntTemp x2
},
- //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
- //faultOnNilArg1: true,
- unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
},
- // large move
- // arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect)
- // arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
- // arg2 = address of the last element of src
- // arg3 = mem
+ // large copying
+ // arg0 = address of dst memory
+ // arg1 = address of src memory
+ // arg2 = mem
+ // auxint = # of bytes to copy
// returns mem
- // LDP.P 16(R16), (R25, Rtmp)
- // STP.P (R25, Rtmp), 16(R17)
- // CMP Rarg2, R16
- // BLE -3(PC)
- // Note: the-end-of-src may be not a valid pointer. it's a problem if it is spilled.
- // the-end-of-src - 16 is within the area to copy, ok to spill.
{
- name: "LoweredMove",
- argLength: 4,
+ name: "LoweredMoveLoop",
+ aux: "Int64",
+ argLength: 3,
reg: regInfo{
- inputs: []regMask{buildReg("R17"), buildReg("R16"), gp &^ buildReg("R25")},
- clobbers: buildReg("R16 R17 R25"),
+ inputs: []regMask{gp &^ r23to25, gp &^ r23to25},
+ clobbers: r23to25, // TODO: figure out needIntTemp x3
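+ // The loop advances the dst/src pointers in place, so the
+ // registers holding arg0 and arg1 are clobbered as a side effect: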
+ clobbersArg0: true,
+ clobbersArg1: true,
},
- clobberFlags: true,
faultOnNilArg0: true,
faultOnNilArg1: true,
},
OpARM64GreaterEqualNoov
OpARM64LoweredZero
OpARM64LoweredZeroLoop
- OpARM64DUFFCOPY
OpARM64LoweredMove
+ OpARM64LoweredMoveLoop
OpARM64LoweredGetClosurePtr
OpARM64LoweredGetCallerSP
OpARM64LoweredGetCallerPC
},
},
{
- name: "DUFFCOPY",
- auxType: auxInt64,
- argLen: 3,
- unsafePoint: true,
+ name: "LoweredMove",
+ auxType: auxInt64,
+ argLen: 3,
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 1048576}, // R21
- {1, 524288}, // R20
+ {0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
+ {1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
},
- clobbers: 303759360, // R16 R17 R20 R21 R26 R30
+ clobbers: 25165824, // R24 R25
},
},
{
- name: "LoweredMove",
- argLen: 4,
- clobberFlags: true,
+ name: "LoweredMoveLoop",
+ auxType: auxInt64,
+ argLen: 3,
faultOnNilArg0: true,
faultOnNilArg1: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 131072}, // R17
- {1, 65536}, // R16
- {2, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
+ {0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
+ {1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
},
- clobbers: 16973824, // R16 R17 R25
+ clobbers: 29360128, // R23 R24 R25
+ clobbersArg0: true,
+ clobbersArg1: true,
},
},
{
return true
}
// match: (Move [s] dst src mem)
- // cond: s%16 != 0 && s%16 <= 8 && s > 64
- // result: (Move [8] (OffPtr <dst.Type> dst [s-8]) (OffPtr <src.Type> src [s-8]) (Move [s-s%16] dst src mem))
+ // cond: s > 64 && s < 192 && logLargeCopy(v, s)
+ // result: (LoweredMove [s] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
- if !(s%16 != 0 && s%16 <= 8 && s > 64) {
+ if !(s > 64 && s < 192 && logLargeCopy(v, s)) {
break
}
- v.reset(OpMove)
- v.AuxInt = int64ToAuxInt(8)
- v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
- v0.AuxInt = int64ToAuxInt(s - 8)
- v0.AddArg(dst)
- v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
- v1.AuxInt = int64ToAuxInt(s - 8)
- v1.AddArg(src)
- v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
- v2.AuxInt = int64ToAuxInt(s - s%16)
- v2.AddArg3(dst, src, mem)
- v.AddArg3(v0, v1, v2)
- return true
- }
- // match: (Move [s] dst src mem)
- // cond: s%16 != 0 && s%16 > 8 && s > 64
- // result: (Move [16] (OffPtr <dst.Type> dst [s-16]) (OffPtr <src.Type> src [s-16]) (Move [s-s%16] dst src mem))
- for {
- s := auxIntToInt64(v.AuxInt)
- dst := v_0
- src := v_1
- mem := v_2
- if !(s%16 != 0 && s%16 > 8 && s > 64) {
- break
- }
- v.reset(OpMove)
- v.AuxInt = int64ToAuxInt(16)
- v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
- v0.AuxInt = int64ToAuxInt(s - 16)
- v0.AddArg(dst)
- v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
- v1.AuxInt = int64ToAuxInt(s - 16)
- v1.AddArg(src)
- v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
- v2.AuxInt = int64ToAuxInt(s - s%16)
- v2.AddArg3(dst, src, mem)
- v.AddArg3(v0, v1, v2)
- return true
- }
- // match: (Move [s] dst src mem)
- // cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)
- // result: (DUFFCOPY [8 * (64 - s/16)] dst src mem)
- for {
- s := auxIntToInt64(v.AuxInt)
- dst := v_0
- src := v_1
- mem := v_2
- if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) {
- break
- }
- v.reset(OpARM64DUFFCOPY)
- v.AuxInt = int64ToAuxInt(8 * (64 - s/16))
+ v.reset(OpARM64LoweredMove)
+ v.AuxInt = int64ToAuxInt(s)
v.AddArg3(dst, src, mem)
return true
}
// match: (Move [s] dst src mem)
- // cond: s%16 == 0 && s > 16*64 && logLargeCopy(v, s)
- // result: (LoweredMove dst src (ADDconst <src.Type> src [s-16]) mem)
+ // cond: s >= 192 && logLargeCopy(v, s)
+ // result: (LoweredMoveLoop [s] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
- if !(s%16 == 0 && s > 16*64 && logLargeCopy(v, s)) {
+ if !(s >= 192 && logLargeCopy(v, s)) {
break
}
- v.reset(OpARM64LoweredMove)
- v0 := b.NewValue0(v.Pos, OpARM64ADDconst, src.Type)
- v0.AuxInt = int64ToAuxInt(s - 16)
- v0.AddArg(src)
- v.AddArg4(dst, src, v0, mem)
+ v.reset(OpARM64LoweredMoveLoop)
+ v.AuxInt = int64ToAuxInt(s)
+ v.AddArg3(dst, src, mem)
return true
}
return false