ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
- case ssa.OpAMD64DUFFZERO:
+
+ case ssa.OpAMD64LoweredZero:
if s.ABI != obj.ABIInternal {
// zero X15 manually
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
- off := duffStart(v.AuxInt)
- adj := duffAdj(v.AuxInt)
- var p *obj.Prog
- if adj != 0 {
- p = s.Prog(x86.ALEAQ)
- p.From.Type = obj.TYPE_MEM
- p.From.Offset = adj
- p.From.Reg = x86.REG_DI
- p.To.Type = obj.TYPE_REG
- p.To.Reg = x86.REG_DI
+ ptrReg := v.Args[0].Reg()
+ n := v.AuxInt
+ if n < 16 {
+ v.Fatalf("Zero too small %d", n)
}
- p = s.Prog(obj.ADUFFZERO)
- p.To.Type = obj.TYPE_ADDR
- p.To.Sym = ir.Syms.Duffzero
- p.To.Offset = off
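+	// Convenience wrapper: the call inside the closure resolves to the
+	// package-level zero16 helper added below, not to the closure itself.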
+ zero16 := func(off int64) {
+ zero16(s, ptrReg, off)
+ }
+
+ // Generate zeroing instructions.
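+	// For example, n = 40 results in stores at offsets 0, 16, and 24;
+	// the final store overlaps the one at offset 16 by 8 bytes.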
+ var off int64
+ for n >= 16 {
+ zero16(off)
+ off += 16
+ n -= 16
+ }
+ if n != 0 {
+		// Zero the remaining n bytes (0 < n < 16) with a 16-byte store
+		// that partially overlaps the bytes already zeroed above.
+		// TODO: for n <= 8, use a smaller write?
+ zero16(off + n - 16)
+ }
+
+ case ssa.OpAMD64LoweredZeroLoop:
+ if s.ABI != obj.ABIInternal {
+ // zero X15 manually
+ opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+ }
+ ptrReg := v.Args[0].Reg()
+ countReg := v.RegTmp()
+ n := v.AuxInt
+ loopSize := int64(64)
+ if n < 3*loopSize {
+ // - a loop count of 0 won't work.
+ // - a loop count of 1 is useless.
+ // - a loop count of 2 is a code size ~tie
+ // 4 instructions to implement the loop
+ // 4 instructions in the loop body
+ // vs
+ // 8 instructions in the straightline code
+ // Might as well use straightline code.
+ v.Fatalf("ZeroLoop size too small %d", n)
+ }
+ zero16 := func(off int64) {
+ zero16(s, ptrReg, off)
+ }
+
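+	// For example, for n = 256 the emitted code looks roughly like:
+	//
+	//	MOVL   $4, countReg
+	// loop:
+	//	MOVUPS X15, (ptrReg)
+	//	MOVUPS X15, 16(ptrReg)
+	//	MOVUPS X15, 32(ptrReg)
+	//	MOVUPS X15, 48(ptrReg)
+	//	ADDQ   $64, ptrReg
+	//	DECL   countReg
+	//	JNE    loop
+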
+ // Put iteration count in a register.
+	// MOVL $(n/loopSize), countReg
+ p := s.Prog(x86.AMOVL)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = n / loopSize
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
+ cntInit := p
+
+ // Zero loopSize bytes starting at ptrReg.
+ for i := range loopSize / 16 {
+ zero16(i * 16)
+ }
+ // ADDQ $loopSize, ptrReg
+ p = s.Prog(x86.AADDQ)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = loopSize
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ptrReg
+ // DECL countReg
+ p = s.Prog(x86.ADECL)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = countReg
+ // Jump to first instruction in loop if we're not done yet.
+ // JNE head
+ p = s.Prog(x86.AJNE)
+ p.To.Type = obj.TYPE_BRANCH
+ p.To.SetTarget(cntInit.Link)
+
+ // Multiples of the loop size are now done.
+ n %= loopSize
+
+ // Write any fractional portion.
+ var off int64
+ for n >= 16 {
+ zero16(off)
+ off += 16
+ n -= 16
+ }
+ if n != 0 {
+		// Zero the remaining n bytes (0 < n < 16) with a 16-byte store that
+		// partially overlaps bytes already zeroed; the offset may be negative,
+		// reaching back into the region the loop just zeroed.
+		// TODO: for n <= 8, use a smaller write?
+ zero16(off + n - 16)
+ }
+
case ssa.OpAMD64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_ADDR
p.Pos = p.Pos.WithNotStmt()
return p
}
+
+// zero16 zeroes the 16 bytes at reg+off by storing register X15,
+// which is assumed to hold zero.
+func zero16(s *ssagen.State, reg int16, off int64) {
+ // MOVUPS X15, off(ptrReg)
+ p := s.Prog(x86.AMOVUPS)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = x86.REG_X15
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = reg
+ p.To.Offset = off
+}
(MOVQstoreconst [makeValAndOff(0,int32(s-8))] destptr
(MOVQstoreconst [makeValAndOff(0,0)] destptr mem))
-// Adjust zeros to be a multiple of 16 bytes.
-(Zero [s] destptr mem) && s%16 != 0 && s > 16 =>
- (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
- (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
-
-(Zero [16] destptr mem) =>
- (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
-(Zero [32] destptr mem) =>
- (MOVOstoreconst [makeValAndOff(0,16)] destptr
- (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
-(Zero [48] destptr mem) =>
- (MOVOstoreconst [makeValAndOff(0,32)] destptr
- (MOVOstoreconst [makeValAndOff(0,16)] destptr
- (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
-(Zero [64] destptr mem) =>
- (MOVOstoreconst [makeValAndOff(0,48)] destptr
- (MOVOstoreconst [makeValAndOff(0,32)] destptr
- (MOVOstoreconst [makeValAndOff(0,16)] destptr
- (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
-
-// Medium zeroing uses a duff device.
-(Zero [s] destptr mem)
- && s > 64 && s <= 1024 && s%16 == 0 =>
- (DUFFZERO [s] destptr mem)
+// Zeroing up to 192 bytes uses straightline code.
+(Zero [s] destptr mem) && s >= 16 && s < 192 => (LoweredZero [s] destptr mem)
+
+// Zeroing from 192 bytes up to repZeroThreshold bytes uses a small loop.
+(Zero [s] destptr mem) && s >= 192 && s <= repZeroThreshold => (LoweredZeroLoop [s] destptr mem)
// Large zeroing uses REP STOSQ.
-(Zero [s] destptr mem)
- && s > 1024 && s%8 == 0 =>
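+// If the size isn't a multiple of 8, zero the first 16 bytes and advance the
+// pointer by s%8 so the remaining size is a multiple of 8.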
+(Zero [s] destptr mem) && s > repZeroThreshold && s%8 != 0 =>
+ (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
+ (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+(Zero [s] destptr mem) && s > repZeroThreshold && s%8 == 0 =>
(REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
// Lowering constants
// auxint = # of bytes to zero
// returns mem
{
- name: "DUFFZERO",
+ name: "LoweredZero",
aux: "Int64",
argLength: 2,
reg: regInfo{
- inputs: []regMask{buildReg("DI")},
- clobbers: buildReg("DI"),
+ inputs: []regMask{gp},
},
- //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
- unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+ faultOnNilArg0: true,
+ },
+
+ // arg0 = pointer to start of memory to zero
+ // arg1 = mem
+ // auxint = # of bytes to zero
+ // returns mem
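+	// Clobbers the pointer register (arg0) and needs an integer temporary
+	// register for the loop count, hence clobbersArg0 and needIntTemp below.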
+ {
+ name: "LoweredZeroLoop",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{gp},
+ clobbersArg0: true,
+ },
+ clobberFlags: true,
+ faultOnNilArg0: true,
+ needIntTemp: true,
},
// arg0 = address of memory to zero
OpAMD64MOVLstoreconstidx4
OpAMD64MOVQstoreconstidx1
OpAMD64MOVQstoreconstidx8
- OpAMD64DUFFZERO
+ OpAMD64LoweredZero
+ OpAMD64LoweredZeroLoop
OpAMD64REPSTOSQ
OpAMD64CALLstatic
OpAMD64CALLtail
},
},
{
- name: "DUFFZERO",
- auxType: auxInt64,
- argLen: 2,
- unsafePoint: true,
+ name: "LoweredZero",
+ auxType: auxInt64,
+ argLen: 2,
+ faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 128}, // DI
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "LoweredZeroLoop",
+ auxType: auxInt64,
+ argLen: 2,
+ clobberFlags: true,
+ needIntTemp: true,
+ faultOnNilArg0: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
- clobbers: 128, // DI
+ clobbersArg0: true,
},
},
{
import (
"cmd/compile/internal/types"
+ "fmt"
"testing"
)
}
+func TestClobbersArg0(t *testing.T) {
+ c := testConfig(t)
+ f := c.Fun("entry",
+ Bloc("entry",
+ Valu("mem", OpInitMem, types.TypeMem, 0, nil),
+ Valu("ptr", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
+ Valu("dst", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
+ Valu("zero", OpAMD64LoweredZeroLoop, types.TypeMem, 256, nil, "ptr", "mem"),
+ Valu("store", OpAMD64MOVQstore, types.TypeMem, 0, nil, "dst", "ptr", "zero"),
+ Exit("store")))
+ flagalloc(f.f)
+ regalloc(f.f)
+ checkFunc(f.f)
+ // LoweredZeroLoop clobbers its argument, so there must be a copy of "ptr" somewhere
+ // so we still have that value available at "store".
+ if n := numCopies(f.blocks["entry"]); n != 1 {
+ fmt.Printf("%s\n", f.f.String())
+ t.Errorf("got %d copies, want 1", n)
+ }
+}
+
func numSpills(b *Block) int {
+ return numOps(b, OpStoreReg)
+}
+func numCopies(b *Block) int {
+ return numOps(b, OpCopy)
+}
+func numOps(b *Block, op Op) int {
n := 0
for _, v := range b.Values {
- if v.Op == OpStoreReg {
+ if v.Op == op {
n++
}
}
const (
leaveDeadValues deadValueChoice = false
removeDeadValues = true
+
+	repZeroThreshold = 1408 // size in bytes beyond which we use REP STOSQ for zeroing
)
// deadcode indicates whether rewrite should try to remove any values that become dead.
return true
}
// match: (Zero [s] destptr mem)
- // cond: s%16 != 0 && s > 16
- // result: (Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16]) (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
+ // cond: s >= 16 && s < 192
+ // result: (LoweredZero [s] destptr mem)
for {
s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
- if !(s%16 != 0 && s > 16) {
+ if !(s >= 16 && s < 192) {
break
}
- v.reset(OpZero)
- v.AuxInt = int64ToAuxInt(s - s%16)
- v0 := b.NewValue0(v.Pos, OpOffPtr, destptr.Type)
- v0.AuxInt = int64ToAuxInt(s % 16)
- v0.AddArg(destptr)
- v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
- v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
- v1.AddArg2(destptr, mem)
- v.AddArg2(v0, v1)
- return true
- }
- // match: (Zero [16] destptr mem)
- // result: (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
- for {
- if auxIntToInt64(v.AuxInt) != 16 {
- break
- }
- destptr := v_0
- mem := v_1
- v.reset(OpAMD64MOVOstoreconst)
- v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
+ v.reset(OpAMD64LoweredZero)
+ v.AuxInt = int64ToAuxInt(s)
v.AddArg2(destptr, mem)
return true
}
- // match: (Zero [32] destptr mem)
- // result: (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
- for {
- if auxIntToInt64(v.AuxInt) != 32 {
- break
- }
- destptr := v_0
- mem := v_1
- v.reset(OpAMD64MOVOstoreconst)
- v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
- v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
- v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
- v0.AddArg2(destptr, mem)
- v.AddArg2(destptr, v0)
- return true
- }
- // match: (Zero [48] destptr mem)
- // result: (MOVOstoreconst [makeValAndOff(0,32)] destptr (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
+ // match: (Zero [s] destptr mem)
+ // cond: s >= 192 && s <= repZeroThreshold
+ // result: (LoweredZeroLoop [s] destptr mem)
for {
- if auxIntToInt64(v.AuxInt) != 48 {
- break
- }
+ s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
- v.reset(OpAMD64MOVOstoreconst)
- v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 32))
- v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
- v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
- v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
- v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
- v1.AddArg2(destptr, mem)
- v0.AddArg2(destptr, v1)
- v.AddArg2(destptr, v0)
- return true
- }
- // match: (Zero [64] destptr mem)
- // result: (MOVOstoreconst [makeValAndOff(0,48)] destptr (MOVOstoreconst [makeValAndOff(0,32)] destptr (MOVOstoreconst [makeValAndOff(0,16)] destptr (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
- for {
- if auxIntToInt64(v.AuxInt) != 64 {
+ if !(s >= 192 && s <= repZeroThreshold) {
break
}
- destptr := v_0
- mem := v_1
- v.reset(OpAMD64MOVOstoreconst)
- v.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 48))
- v0 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
- v0.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 32))
- v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
- v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 16))
- v2 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
- v2.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
- v2.AddArg2(destptr, mem)
- v1.AddArg2(destptr, v2)
- v0.AddArg2(destptr, v1)
- v.AddArg2(destptr, v0)
+ v.reset(OpAMD64LoweredZeroLoop)
+ v.AuxInt = int64ToAuxInt(s)
+ v.AddArg2(destptr, mem)
return true
}
// match: (Zero [s] destptr mem)
- // cond: s > 64 && s <= 1024 && s%16 == 0
- // result: (DUFFZERO [s] destptr mem)
+ // cond: s > repZeroThreshold && s%8 != 0
+ // result: (Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8]) (MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
for {
s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
- if !(s > 64 && s <= 1024 && s%16 == 0) {
+ if !(s > repZeroThreshold && s%8 != 0) {
break
}
- v.reset(OpAMD64DUFFZERO)
- v.AuxInt = int64ToAuxInt(s)
- v.AddArg2(destptr, mem)
+ v.reset(OpZero)
+ v.AuxInt = int64ToAuxInt(s - s%8)
+ v0 := b.NewValue0(v.Pos, OpOffPtr, destptr.Type)
+ v0.AuxInt = int64ToAuxInt(s % 8)
+ v0.AddArg(destptr)
+ v1 := b.NewValue0(v.Pos, OpAMD64MOVOstoreconst, types.TypeMem)
+ v1.AuxInt = valAndOffToAuxInt(makeValAndOff(0, 0))
+ v1.AddArg2(destptr, mem)
+ v.AddArg2(v0, v1)
return true
}
// match: (Zero [s] destptr mem)
- // cond: s > 1024 && s%8 == 0
+ // cond: s > repZeroThreshold && s%8 == 0
// result: (REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
for {
s := auxIntToInt64(v.AuxInt)
destptr := v_0
mem := v_1
- if !(s > 1024 && s%8 == 0) {
+ if !(s > repZeroThreshold && s%8 == 0) {
break
}
v.reset(OpAMD64REPSTOSQ)
func (t *T) f() {
// amd64:-".*runtime.memclrNoHeapPointers"
- // amd64:"DUFFZERO"
+ // amd64:`MOVUPS\tX15,`
for i := range t.a {
t.a[i] = 0
}
// amd64:-".*runtime.memclrNoHeapPointers"
- // amd64:"DUFFZERO"
+ // amd64:`MOVUPS\tX15,`
for i := range *t.a {
t.a[i] = 0
}
// amd64:-".*runtime.memclrNoHeapPointers"
- // amd64:"DUFFZERO"
+ // amd64:`MOVUPS\tX15,`
for i := range t.a {
(*t.a)[i] = 0
}
// amd64:-".*runtime.memclrNoHeapPointers"
- // amd64:"DUFFZERO"
+ // amd64:`MOVUPS\tX15,`
for i := range *t.a {
(*t.a)[i] = 0
}
// amd64:-".*runtime.memclrNoHeapPointers"
- // amd64:"DUFFZERO"
+ // amd64:`MOVUPS\tX15,`
for i := range t.b {
t.b[i] = 0
}