p.To.Sym = ir.Syms.Duffzero
p.To.Offset = v.AuxInt
case ssa.OpLOONG64LoweredZero:
- // MOVx R0, (Rarg0)
- // ADDV $sz, Rarg0
- // BGEU Rarg1, Rarg0, -2(PC)
- mov, sz := largestMove(v.AuxInt)
- p := s.Prog(mov)
- p.From.Type = obj.TYPE_REG
- p.From.Reg = loong64.REGZERO
- p.To.Type = obj.TYPE_MEM
- p.To.Reg = v.Args[0].Reg()
+ ptrReg := v.Args[0].Reg()
+ n := v.AuxInt
+ if n < 16 {
+ v.Fatalf("Zero too small %d", n)
+ }
- p2 := s.Prog(loong64.AADDVU)
- p2.From.Type = obj.TYPE_CONST
- p2.From.Offset = sz
- p2.To.Type = obj.TYPE_REG
- p2.To.Reg = v.Args[0].Reg()
+ // Generate zeroing instructions.
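+ // For example (illustrative), n=23 emits:
+ //	MOVV	ZR, 0(ptrReg)
+ //	MOVV	ZR, 8(ptrReg)
+ //	MOVV	ZR, 15(ptrReg) // overlapping store covers the 7-byte tail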
+ var off int64
+ for n >= 8 {
+ // MOVV ZR, off(ptrReg)
+ zero8(s, ptrReg, off)
+ off += 8
+ n -= 8
+ }
+ if n != 0 {
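+ // Zero the final n (1-7) bytes with an 8-byte store that overlaps the previous one.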
+ // MOVV ZR, off+n-8(ptrReg)
+ zero8(s, ptrReg, off+n-8)
+ }
+ case ssa.OpLOONG64LoweredZeroLoop:
+ ptrReg := v.Args[0].Reg()
+ countReg := v.RegTmp()
+ var off int64
+ n := v.AuxInt
+ loopSize := int64(64)
+ if n < 3*loopSize {
+ // - a loop count of 0 won't work.
+ // - a loop count of 1 is useless.
+ // - a loop count of 2 is a code size ~tie
+ //     4 instructions to implement the loop
+ //     8 instructions in the loop body
+ //   vs
+ //     16 instructions in the straightline code
+ //   Might as well use straightline code.
+ v.Fatalf("ZeroLoop size tool small %d", n)
+ }
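+ // In sketch, the emitted code is:
+ //	MOVV	$(n/loopSize), countReg
+ // loop:
+ //	MOVV	ZR, 0(ptrReg)
+ //	...
+ //	MOVV	ZR, 56(ptrReg)
+ //	ADDV	$loopSize, ptrReg
+ //	SUBV	$1, countReg
+ //	BNE	countReg, loop
+ // followed by straightline stores for any remaining tail.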
- p3 := s.Prog(loong64.ABGEU)
- p3.From.Type = obj.TYPE_REG
- p3.From.Reg = v.Args[1].Reg()
- p3.Reg = v.Args[0].Reg()
- p3.To.Type = obj.TYPE_BRANCH
- p3.To.SetTarget(p)
+ // Put iteration count in a register.
+ // MOVV $n/loopSize, countReg
+ p := s.Prog(loong64.AMOVV)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = n / loopSize
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
+ cntInit := p
+
+ // Zero loopSize bytes starting at ptrReg.
+ for range loopSize / 8 {
+ // MOVV ZR, off(ptrReg)
+ zero8(s, ptrReg, off)
+ off += 8
+ }
+
+ // Increment ptrReg by loopSize.
+ // ADDV $loopSize, ptrReg
+ p = s.Prog(loong64.AADDV)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = loopSize
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ptrReg
+
+ // Decrement loop count.
+ // SUBV $1, countReg
+ p = s.Prog(loong64.ASUBV)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 1
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
+
+ // Jump to loop header if we're not done yet.
+ // BNE countReg, loop header
+ p = s.Prog(loong64.ABNE)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = countReg
+ p.To.Type = obj.TYPE_BRANCH
+ p.To.SetTarget(cntInit.Link)
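+ // cntInit.Link is the first instruction emitted after the count init, i.e. the loop head.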
+
+ // Multiples of the loop size are now done.
+ n %= loopSize
+
+ off = 0
+ // Zero any remaining fractional portion.
+ for n >= 8 {
+ // MOVV ZR, off(ptrReg)
+ zero8(s, ptrReg, off)
+ off += 8
+ n -= 8
+ }
+
+ if n != 0 {
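+ // MOVV ZR, off+n-8(ptrReg) (overlapping store zeroes the final n bytes)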
+ zero8(s, ptrReg, off+n-8)
+ }
case ssa.OpLOONG64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.Pos = p.Pos.WithNotStmt()
return p
}
+
+// zero8 zeroes 8 bytes at reg+off.
+func zero8(s *ssagen.State, reg int16, off int64) {
+ // MOVV ZR, off(reg)
+ p := s.Prog(loong64.AMOVV)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = loong64.REGZERO
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = reg
+ p.To.Offset = off
+}
(MOVVstore [8] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
-// strip off fractional word zeroing
-(Zero [s] ptr mem) && s%8 != 0 && s > 16 =>
- (Zero [s%8]
- (OffPtr <ptr.Type> ptr [s-s%8])
- (Zero [s-s%8] ptr mem))
-
-// medium zeroing uses a duff device
-(Zero [s] ptr mem)
- && s%8 == 0 && s > 16 && s <= 8*128 =>
- (DUFFZERO [8 * (128 - s/8)] ptr mem)
-
-// large zeroing uses a loop
-(Zero [s] ptr mem)
- && s%8 == 0 && s > 8*128 =>
- (LoweredZero
- ptr
- (ADDVconst <ptr.Type> ptr [s-8])
- mem)
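+// medium zeroing, expanded in the backend as straightline MOVV stores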
+(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem)
// moves
(Move [0] _ _ mem) => mem
faultOnNilArg0: true,
},
+ // medium zeroing
+ // arg0 = address of memory to zero
+ // arg1 = mem
+ // auxint = number of bytes to zero
+ // returns mem
+ {
+ name: "LoweredZero",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp},
+ },
+ faultOnNilArg0: true,
+ },
+
// duffcopy
// arg0 = address of dst memory (in R21, changed as side effect)
// arg1 = address of src memory (in R20, changed as side effect)
faultOnNilArg1: true,
},
- // large or unaligned zeroing
- // arg0 = address of memory to zero (in R20, changed as side effect)
- // arg1 = address of the last element to zero
- // arg2 = mem
- // auxint = alignment
+ // large zeroing
+ // arg0 = address of memory to zero (changed as side effect)
+ // arg1 = mem
+ // auxint = number of bytes to zero
// returns mem
- // MOVx R0, (R20)
- // ADDV $sz, R20
- // BGEU Rarg1, R20, -2(PC)
{
- name: "LoweredZero",
+ name: "LoweredZeroLoop",
aux: "Int64",
- argLength: 3,
+ argLength: 2,
reg: regInfo{
- inputs: []regMask{buildReg("R20"), gp},
- clobbers: buildReg("R20"),
+ inputs: []regMask{gp},
+ clobbersArg0: true,
},
- typ: "Mem",
faultOnNilArg0: true,
+ needIntTemp: true,
},
// large or unaligned move
OpLOONG64CALLclosure
OpLOONG64CALLinter
OpLOONG64DUFFZERO
- OpLOONG64DUFFCOPY
OpLOONG64LoweredZero
+ OpLOONG64DUFFCOPY
+ OpLOONG64LoweredZeroLoop
OpLOONG64LoweredMove
OpLOONG64LoweredAtomicLoad8
OpLOONG64LoweredAtomicLoad32
clobbers: 524290, // R1 R20
},
},
+ {
+ name: "LoweredZero",
+ auxType: auxInt64,
+ argLen: 2,
+ faultOnNilArg0: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+ },
+ },
+ },
{
name: "DUFFCOPY",
auxType: auxInt64,
},
},
{
- name: "LoweredZero",
+ name: "LoweredZeroLoop",
auxType: auxInt64,
- argLen: 3,
+ argLen: 2,
+ needIntTemp: true,
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 524288}, // R20
- {1, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
},
- clobbers: 524288, // R20
+ clobbersArg0: true,
},
},
{
return true
}
// match: (Zero [s] ptr mem)
- // cond: s%8 != 0 && s > 16
- // result: (Zero [s%8] (OffPtr <ptr.Type> ptr [s-s%8]) (Zero [s-s%8] ptr mem))
- for {
- s := auxIntToInt64(v.AuxInt)
- ptr := v_0
- mem := v_1
- if !(s%8 != 0 && s > 16) {
- break
- }
- v.reset(OpZero)
- v.AuxInt = int64ToAuxInt(s % 8)
- v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
- v0.AuxInt = int64ToAuxInt(s - s%8)
- v0.AddArg(ptr)
- v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
- v1.AuxInt = int64ToAuxInt(s - s%8)
- v1.AddArg2(ptr, mem)
- v.AddArg2(v0, v1)
- return true
- }
- // match: (Zero [s] ptr mem)
- // cond: s%8 == 0 && s > 16 && s <= 8*128
- // result: (DUFFZERO [8 * (128 - s/8)] ptr mem)
+ // cond: s > 16 && s < 192
+ // result: (LoweredZero [s] ptr mem)
for {
s := auxIntToInt64(v.AuxInt)
ptr := v_0
mem := v_1
- if !(s%8 == 0 && s > 16 && s <= 8*128) {
+ if !(s > 16 && s < 192) {
break
}
- v.reset(OpLOONG64DUFFZERO)
- v.AuxInt = int64ToAuxInt(8 * (128 - s/8))
+ v.reset(OpLOONG64LoweredZero)
+ v.AuxInt = int64ToAuxInt(s)
v.AddArg2(ptr, mem)
return true
}
// match: (Zero [s] ptr mem)
- // cond: s%8 == 0 && s > 8*128
- // result: (LoweredZero ptr (ADDVconst <ptr.Type> ptr [s-8]) mem)
+ // cond: s >= 192
+ // result: (LoweredZeroLoop [s] ptr mem)
for {
s := auxIntToInt64(v.AuxInt)
ptr := v_0
mem := v_1
- if !(s%8 == 0 && s > 8*128) {
+ if !(s >= 192) {
break
}
- v.reset(OpLOONG64LoweredZero)
- v0 := b.NewValue0(v.Pos, OpLOONG64ADDVconst, ptr.Type)
- v0.AuxInt = int64ToAuxInt(s - 8)
- v0.AddArg(ptr)
- v.AddArg3(ptr, v0, mem)
+ v.reset(OpLOONG64LoweredZeroLoop)
+ v.AuxInt = int64ToAuxInt(s)
+ v.AddArg2(ptr, mem)
return true
}
return false