p.From.Offset = int64(condCode)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
- case ssa.OpARM64DUFFZERO:
- // runtime.duffzero expects start address in R20
- p := s.Prog(obj.ADUFFZERO)
- p.To.Type = obj.TYPE_MEM
- p.To.Name = obj.NAME_EXTERN
- p.To.Sym = ir.Syms.Duffzero
- p.To.Offset = v.AuxInt
case ssa.OpARM64LoweredZero:
- // STP.P (ZR,ZR), 16(R16)
- // CMP Rarg1, R16
- // BLE -2(PC)
- // arg1 is the address of the last 16-byte unit to zero
- p := s.Prog(arm64.ASTP)
- p.Scond = arm64.C_XPOST
- p.From.Type = obj.TYPE_REGREG
- p.From.Reg = arm64.REGZERO
- p.From.Offset = int64(arm64.REGZERO)
- p.To.Type = obj.TYPE_MEM
- p.To.Reg = arm64.REG_R16
- p.To.Offset = 16
- p2 := s.Prog(arm64.ACMP)
- p2.From.Type = obj.TYPE_REG
- p2.From.Reg = v.Args[1].Reg()
- p2.Reg = arm64.REG_R16
- p3 := s.Prog(arm64.ABLE)
- p3.To.Type = obj.TYPE_BRANCH
- p3.To.SetTarget(p)
+ ptrReg := v.Args[0].Reg()
+ n := v.AuxInt
+ if n < 16 {
+ v.Fatalf("Zero too small %d", n)
+ }
+
+ // Generate zeroing instructions.
+ var off int64
+ for n >= 16 {
+ // STP (ZR, ZR), off(ptrReg)
+ zero16(s, ptrReg, off, false)
+ off += 16
+ n -= 16
+ }
+ // Write any fractional portion.
+ // An overlapping 16-byte write can't be used here
+ // because STP's offsets must be a multiple of 8.
+ if n > 8 {
+ // MOVD ZR, off(ptrReg)
+ zero8(s, ptrReg, off)
+ off += 8
+ n -= 8
+ }
+ if n != 0 {
+ // MOVD ZR, off+n-8(ptrReg)
+ // TODO: for n<=4 we could use a smaller write.
+ zero8(s, ptrReg, off+n-8)
+ }
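+ // For example (illustrative only, assuming ptrReg is R0),
+ // a 30-byte zero emits:
+ // STP (ZR, ZR), 0(R0)
+ // MOVD ZR, 16(R0)
+ // MOVD ZR, 22(R0)
+ // where the final store overlaps the previous one to cover
+ // the last 6 bytes.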
+ case ssa.OpARM64LoweredZeroLoop:
+ ptrReg := v.Args[0].Reg()
+ countReg := v.RegTmp()
+ n := v.AuxInt
+ loopSize := int64(64)
+ if n < 3*loopSize {
+ // - a loop count of 0 won't work.
+ // - a loop count of 1 is useless.
+ // - a loop count of 2 is a code size ~tie
+ // 3 instructions to implement the loop
+ // 4 instructions in the loop body
+ // vs
+ // 8 instructions in the straightline code
+ // Might as well use straightline code.
+ v.Fatalf("ZeroLoop size too small %d", n)
+ }
+
+ // Put iteration count in a register.
+ // MOVD $(n/loopSize), countReg
+ p := s.Prog(arm64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = n / loopSize
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
+ cntInit := p
+
+ // Zero loopSize bytes starting at ptrReg.
+ // Increment ptrReg by loopSize as a side effect.
+ for range loopSize / 16 {
+ // STP.P (ZR, ZR), 16(ptrReg)
+ zero16(s, ptrReg, 0, true)
+ // TODO: should we use the postincrement form,
+ // or use a separate += 64 instruction?
+ // postincrement saves an instruction, but maybe
+ // it requires more integer units to do the +=16s.
+ }
+ // Decrement loop count.
+ // SUB $1, countReg
+ p = s.Prog(arm64.ASUB)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 1
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
+ // Jump to loop header if we're not done yet.
+ // CBNZ countReg, head
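+ // The branch target is cntInit.Link, the first STP of the
+ // loop body emitted above.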
+ p = s.Prog(arm64.ACBNZ)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = countReg
+ p.To.Type = obj.TYPE_BRANCH
+ p.To.SetTarget(cntInit.Link)
+
+ // Multiples of the loop size are now done.
+ n %= loopSize
+
+ // Write any fractional portion.
+ var off int64
+ for n >= 16 {
+ // STP (ZR, ZR), off(ptrReg)
+ zero16(s, ptrReg, off, false)
+ off += 16
+ n -= 16
+ }
+ if n > 8 {
+ // Note: an overlapping 16-byte write can't be used
+ // here because STP's offsets must be a multiple of 8.
+ // MOVD ZR, off(ptrReg)
+ zero8(s, ptrReg, off)
+ off += 8
+ n -= 8
+ }
+ if n != 0 {
+ // MOVD ZR, off+n-8(ptrReg)
+ // TODO: for n<=4 we could use a smaller write.
+ zero8(s, ptrReg, off+n-8)
+ }
+ // TODO: maybe we should use the count register to instead
+ // hold an end pointer and compare against that?
+ // ADD $n, ptrReg, endReg
+ // then
+ // CMP ptrReg, endReg
+ // BNE loop
+ // There's a past-the-end pointer here, any problem with that?
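+ // Putting it together (illustrative only; the register choices
+ // are an assumption, with ptrReg shown as R0 and countReg as R1),
+ // a 200-byte zero emits roughly:
+ // MOVD $3, R1
+ // loop:
+ // STP.P (ZR, ZR), 16(R0)
+ // STP.P (ZR, ZR), 16(R0)
+ // STP.P (ZR, ZR), 16(R0)
+ // STP.P (ZR, ZR), 16(R0)
+ // SUB $1, R1
+ // CBNZ R1, loop
+ // MOVD ZR, 0(R0)
+ // After the loop, R0 has advanced 192 bytes past its starting value.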
+
case ssa.OpARM64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_MEM
p.Pos = p.Pos.WithNotStmt()
return p
}
+
+// zero16 zeroes 16 bytes at reg+off.
+// If postInc is true, increment reg by 16.
+func zero16(s *ssagen.State, reg int16, off int64, postInc bool) {
+ // STP (ZR, ZR), off(reg)
+ p := s.Prog(arm64.ASTP)
+ p.From.Type = obj.TYPE_REGREG
+ p.From.Reg = arm64.REGZERO
+ p.From.Offset = int64(arm64.REGZERO)
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = reg
+ p.To.Offset = off
+ if postInc {
+ if off != 0 {
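+ // In post-index form the store address is the unmodified reg
+ // (the 16 is the increment applied afterwards), so a separate
+ // store offset can't be encoded.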
+ panic("can't postinc with non-zero offset")
+ }
+ // STP.P (ZR, ZR), 16(reg)
+ p.Scond = arm64.C_XPOST
+ p.To.Offset = 16
+ }
+}
+
+// zero8 zeroes 8 bytes at reg+off.
+func zero8(s *ssagen.State, reg int16, off int64) {
+ // MOVD ZR, off(reg)
+ p := s.Prog(arm64.AMOVD)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = arm64.REGZERO
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = reg
+ p.To.Offset = off
+}
(Zero [16] ptr mem) =>
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
-(Zero [32] ptr mem) =>
- (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
- (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
-
-(Zero [48] ptr mem) =>
- (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
- (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
- (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
-
-(Zero [64] ptr mem) =>
- (STP [48] ptr (MOVDconst [0]) (MOVDconst [0])
- (STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
- (STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
- (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
-
-// strip off fractional word zeroing
-(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 =>
- (Zero [8]
- (OffPtr <ptr.Type> ptr [s-8])
- (Zero [s-s%16] ptr mem))
-(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 =>
- (Zero [16]
- (OffPtr <ptr.Type> ptr [s-16])
- (Zero [s-s%16] ptr mem))
-
-// medium zeroing uses a duff device
-// 4, 16, and 64 are magic constants, see runtime/mkduff.go
-(Zero [s] ptr mem)
- && s%16 == 0 && s > 64 && s <= 16*64 =>
- (DUFFZERO [4 * (64 - s/16)] ptr mem)
-
-// large zeroing uses a loop
-(Zero [s] ptr mem)
- && s%16 == 0 && s > 16*64 =>
- (LoweredZero
- ptr
- (ADDconst <ptr.Type> [s-16] ptr)
- mem)
+(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem)
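+// The 192-byte cutoff matches the 3*loopSize (3*64) minimum enforced by
+// LoweredZeroLoop in arm64/ssa.go; smaller sizes expand to straightline
+// STP/MOVD stores. For example, Zero [40] becomes LoweredZero [40] and
+// Zero [256] becomes LoweredZeroLoop [256].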
// moves
(Move [0] _ _ mem) => mem
{name: "LessThanNoov", argLength: 1, reg: readflags}, // bool, true flags encode signed x<y but without honoring overflow, false otherwise.
{name: "GreaterEqualNoov", argLength: 1, reg: readflags}, // bool, true flags encode signed x>=y but without honoring overflow, false otherwise.
- // duffzero
+ // medium zeroing
// arg0 = address of memory to zero
// arg1 = mem
- // auxint = offset into duffzero code to start executing
+ // auxint = # of bytes to zero
// returns mem
- // R20 changed as side effect
- // R16 and R17 may be clobbered by linker trampoline.
{
- name: "DUFFZERO",
+ name: "LoweredZero",
aux: "Int64",
argLength: 2,
reg: regInfo{
- inputs: []regMask{buildReg("R20")},
- clobbers: buildReg("R16 R17 R20 R30"),
+ inputs: []regMask{gp},
},
- //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
- unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts
+ faultOnNilArg0: true,
},
// large zeroing
- // arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
- // arg1 = address of the last 16-byte unit to zero
- // arg2 = mem
+ // arg0 = address of memory to zero
+ // arg1 = mem
+ // auxint = # of bytes to zero
// returns mem
- // STP.P (ZR,ZR), 16(R16)
- // CMP Rarg1, R16
- // BLE -2(PC)
- // Note: the-end-of-the-memory may be not a valid pointer. it's a problem if it is spilled.
- // the-end-of-the-memory - 16 is with the area to zero, ok to spill.
{
- name: "LoweredZero",
- argLength: 3,
+ name: "LoweredZeroLoop",
+ aux: "Int64",
+ argLength: 2,
reg: regInfo{
- inputs: []regMask{buildReg("R16"), gp},
- clobbers: buildReg("R16"),
+ inputs: []regMask{gp},
+ clobbersArg0: true,
},
- clobberFlags: true,
faultOnNilArg0: true,
+ needIntTemp: true,
},
// duffcopy
OpARM64NotGreaterEqualF
OpARM64LessThanNoov
OpARM64GreaterEqualNoov
- OpARM64DUFFZERO
OpARM64LoweredZero
+ OpARM64LoweredZeroLoop
OpARM64DUFFCOPY
OpARM64LoweredMove
OpARM64LoweredGetClosurePtr
},
},
{
- name: "DUFFZERO",
- auxType: auxInt64,
- argLen: 2,
- unsafePoint: true,
+ name: "LoweredZero",
+ auxType: auxInt64,
+ argLen: 2,
+ faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 524288}, // R20
+ {0, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
- clobbers: 269156352, // R16 R17 R20 R30
},
},
{
- name: "LoweredZero",
- argLen: 3,
- clobberFlags: true,
+ name: "LoweredZeroLoop",
+ auxType: auxInt64,
+ argLen: 2,
+ needIntTemp: true,
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 65536}, // R16
- {1, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
+ {0, 335544319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
- clobbers: 65536, // R16
+ clobbersArg0: true,
},
},
{
v.AddArg4(ptr, v0, v0, mem)
return true
}
- // match: (Zero [32] ptr mem)
- // result: (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
- for {
- if auxIntToInt64(v.AuxInt) != 32 {
- break
- }
- ptr := v_0
- mem := v_1
- v.reset(OpARM64STP)
- v.AuxInt = int32ToAuxInt(16)
- v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
- v0.AuxInt = int64ToAuxInt(0)
- v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
- v1.AuxInt = int32ToAuxInt(0)
- v1.AddArg4(ptr, v0, v0, mem)
- v.AddArg4(ptr, v0, v0, v1)
- return true
- }
- // match: (Zero [48] ptr mem)
- // result: (STP [32] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
- for {
- if auxIntToInt64(v.AuxInt) != 48 {
- break
- }
- ptr := v_0
- mem := v_1
- v.reset(OpARM64STP)
- v.AuxInt = int32ToAuxInt(32)
- v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
- v0.AuxInt = int64ToAuxInt(0)
- v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
- v1.AuxInt = int32ToAuxInt(16)
- v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
- v2.AuxInt = int32ToAuxInt(0)
- v2.AddArg4(ptr, v0, v0, mem)
- v1.AddArg4(ptr, v0, v0, v2)
- v.AddArg4(ptr, v0, v0, v1)
- return true
- }
- // match: (Zero [64] ptr mem)
- // result: (STP [48] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [32] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [16] ptr (MOVDconst [0]) (MOVDconst [0]) (STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
- for {
- if auxIntToInt64(v.AuxInt) != 64 {
- break
- }
- ptr := v_0
- mem := v_1
- v.reset(OpARM64STP)
- v.AuxInt = int32ToAuxInt(48)
- v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
- v0.AuxInt = int64ToAuxInt(0)
- v1 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
- v1.AuxInt = int32ToAuxInt(32)
- v2 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
- v2.AuxInt = int32ToAuxInt(16)
- v3 := b.NewValue0(v.Pos, OpARM64STP, types.TypeMem)
- v3.AuxInt = int32ToAuxInt(0)
- v3.AddArg4(ptr, v0, v0, mem)
- v2.AddArg4(ptr, v0, v0, v3)
- v1.AddArg4(ptr, v0, v0, v2)
- v.AddArg4(ptr, v0, v0, v1)
- return true
- }
- // match: (Zero [s] ptr mem)
- // cond: s%16 != 0 && s%16 <= 8 && s > 16
- // result: (Zero [8] (OffPtr <ptr.Type> ptr [s-8]) (Zero [s-s%16] ptr mem))
- for {
- s := auxIntToInt64(v.AuxInt)
- ptr := v_0
- mem := v_1
- if !(s%16 != 0 && s%16 <= 8 && s > 16) {
- break
- }
- v.reset(OpZero)
- v.AuxInt = int64ToAuxInt(8)
- v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
- v0.AuxInt = int64ToAuxInt(s - 8)
- v0.AddArg(ptr)
- v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
- v1.AuxInt = int64ToAuxInt(s - s%16)
- v1.AddArg2(ptr, mem)
- v.AddArg2(v0, v1)
- return true
- }
- // match: (Zero [s] ptr mem)
- // cond: s%16 != 0 && s%16 > 8 && s > 16
- // result: (Zero [16] (OffPtr <ptr.Type> ptr [s-16]) (Zero [s-s%16] ptr mem))
- for {
- s := auxIntToInt64(v.AuxInt)
- ptr := v_0
- mem := v_1
- if !(s%16 != 0 && s%16 > 8 && s > 16) {
- break
- }
- v.reset(OpZero)
- v.AuxInt = int64ToAuxInt(16)
- v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
- v0.AuxInt = int64ToAuxInt(s - 16)
- v0.AddArg(ptr)
- v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
- v1.AuxInt = int64ToAuxInt(s - s%16)
- v1.AddArg2(ptr, mem)
- v.AddArg2(v0, v1)
- return true
- }
// match: (Zero [s] ptr mem)
- // cond: s%16 == 0 && s > 64 && s <= 16*64
- // result: (DUFFZERO [4 * (64 - s/16)] ptr mem)
+ // cond: s > 16 && s < 192
+ // result: (LoweredZero [s] ptr mem)
for {
s := auxIntToInt64(v.AuxInt)
ptr := v_0
mem := v_1
- if !(s%16 == 0 && s > 64 && s <= 16*64) {
+ if !(s > 16 && s < 192) {
break
}
- v.reset(OpARM64DUFFZERO)
- v.AuxInt = int64ToAuxInt(4 * (64 - s/16))
+ v.reset(OpARM64LoweredZero)
+ v.AuxInt = int64ToAuxInt(s)
v.AddArg2(ptr, mem)
return true
}
// match: (Zero [s] ptr mem)
- // cond: s%16 == 0 && s > 16*64
- // result: (LoweredZero ptr (ADDconst <ptr.Type> [s-16] ptr) mem)
+ // cond: s >= 192
+ // result: (LoweredZeroLoop [s] ptr mem)
for {
s := auxIntToInt64(v.AuxInt)
ptr := v_0
mem := v_1
- if !(s%16 == 0 && s > 16*64) {
+ if !(s >= 192) {
break
}
- v.reset(OpARM64LoweredZero)
- v0 := b.NewValue0(v.Pos, OpARM64ADDconst, ptr.Type)
- v0.AuxInt = int64ToAuxInt(s - 16)
- v0.AddArg(ptr)
- v.AddArg3(ptr, v0, mem)
+ v.reset(OpARM64LoweredZeroLoop)
+ v.AuxInt = int64ToAuxInt(s)
+ v.AddArg2(ptr, mem)
return true
}
return false