a.Index = i
}
+// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ;
+// see runtime/mkduff.go.
+const (
+ dzBlocks = 16 // number of MOVUPS/LEAQ blocks
+ dzBlockLen = 4 // number of clears per block
+ dzBlockSize = 23 // size of instructions in a single block
+ dzMovSize = 5 // size of single MOV instruction w/ offset
+ dzLeaqSize = 4 // size of single LEAQ instruction
+ dzClearStep = 16 // number of bytes cleared by each MOV instruction
+)
+
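+// As a sketch (layout assumed from runtime/mkduff.go; the zero register used
+// there is an implementation detail), a single duffzero block looks like:
+//	MOVUPS	X15, (DI)	// 4 bytes: the first clear has no offset
+//	MOVUPS	X15, 16(DI)	// dzMovSize = 5 bytes for each offsetted clear
+//	MOVUPS	X15, 32(DI)
+//	MOVUPS	X15, 48(DI)
+//	LEAQ	64(DI), DI	// dzLeaqSize = 4 bytes
+// for dzBlockSize = 4 + 3*5 + 4 = 23 bytes per block, clearing
+// dzBlockLen*dzClearStep = 64 bytes.
+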
+func duffStart(size int64) int64 {
+ x, _ := duff(size)
+ return x
+}
+func duffAdj(size int64) int64 {
+ _, x := duff(size)
+ return x
+}
+
+// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
+// required to use the duffzero mechanism for a block of the given size.
+func duff(size int64) (int64, int64) {
+ if size < 32 || size > 1024 || size%dzClearStep != 0 {
+ panic("bad duffzero size")
+ }
+ steps := size / dzClearStep
+ blocks := steps / dzBlockLen
+ steps %= dzBlockLen
+ off := dzBlockSize * (dzBlocks - blocks)
+ var adj int64
+ if steps != 0 {
+ off -= dzLeaqSize
+ off -= dzMovSize * steps
+ adj -= dzClearStep * (dzBlockLen - steps)
+ }
+ return off, adj
+}
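+
+// As a worked example using the constants above: duff(80) needs 80/16 = 5
+// clears, i.e. one full block plus one extra MOV. It returns
+//	off = 23*(16-1) - 4 - 5*1 = 336
+//	adj = -16*(4-1) = -48
+// so (assuming the block layout sketched above) execution enters duffzero one
+// MOV and one LEAQ before the last full block, and the caller pre-adjusts the
+// pointer by -48 to compensate for the three clears of that block that are
+// skipped.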
+
func getgFromTLS(s *ssagen.State, r int16) {
// See the comments in cmd/internal/obj/x86/obj6.go
// near CanUse1InsnTLS for a detailed explanation of these instructions.
zero16(off + n - 16)
}
- case ssa.OpAMD64LoweredMove:
- dstReg := v.Args[0].Reg()
- srcReg := v.Args[1].Reg()
- if dstReg == srcReg {
- break
- }
- tmpReg := int16(x86.REG_X14)
- n := v.AuxInt
- if n < 16 {
- v.Fatalf("Move too small %d", n)
- }
- // move 16 bytes from srcReg+off to dstReg+off.
- move16 := func(off int64) {
- move16(s, srcReg, dstReg, tmpReg, off)
- }
-
- // Generate copying instructions.
- var off int64
- for n >= 16 {
- move16(off)
- off += 16
- n -= 16
- }
- if n != 0 {
- // use partially overlapped read/write.
- // TODO: use smaller operations when we can?
- move16(off + n - 16)
- }
-
- case ssa.OpAMD64LoweredMoveLoop:
- dstReg := v.Args[0].Reg()
- srcReg := v.Args[1].Reg()
- if dstReg == srcReg {
- break
- }
- countReg := v.RegTmp()
- tmpReg := int16(x86.REG_X14)
- n := v.AuxInt
- loopSize := int64(64)
- if n < 3*loopSize {
- // - a loop count of 0 won't work.
- // - a loop count of 1 is useless.
- // - a loop count of 2 is a code size ~tie
- // 4 instructions to implement the loop
- // 4 instructions in the loop body
- // vs
- // 8 instructions in the straightline code
- // Might as well use straightline code.
- v.Fatalf("ZeroLoop size too small %d", n)
- }
- // move 16 bytes from srcReg+off to dstReg+off.
- move16 := func(off int64) {
- move16(s, srcReg, dstReg, tmpReg, off)
- }
-
- // Put iteration count in a register.
- // MOVL $n, countReg
- p := s.Prog(x86.AMOVL)
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = n / loopSize
- p.To.Type = obj.TYPE_REG
- p.To.Reg = countReg
- cntInit := p
-
- // Copy loopSize bytes starting at srcReg to dstReg.
- for i := range loopSize / 16 {
- move16(i * 16)
- }
- // ADDQ $loopSize, srcReg
- p = s.Prog(x86.AADDQ)
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = loopSize
- p.To.Type = obj.TYPE_REG
- p.To.Reg = srcReg
- // ADDQ $loopSize, dstReg
- p = s.Prog(x86.AADDQ)
- p.From.Type = obj.TYPE_CONST
- p.From.Offset = loopSize
- p.To.Type = obj.TYPE_REG
- p.To.Reg = dstReg
- // DECL countReg
- p = s.Prog(x86.ADECL)
- p.To.Type = obj.TYPE_REG
- p.To.Reg = countReg
- // Jump to loop header if we're not done yet.
- // JNE head
- p = s.Prog(x86.AJNE)
- p.To.Type = obj.TYPE_BRANCH
- p.To.SetTarget(cntInit.Link)
-
- // Multiples of the loop size are now done.
- n %= loopSize
-
- // Copy any fractional portion.
- var off int64
- for n >= 16 {
- move16(off)
- off += 16
- n -= 16
- }
- if n != 0 {
- // Use partially-overlapping copy.
- move16(off + n - 16)
+ case ssa.OpAMD64DUFFCOPY:
+ p := s.Prog(obj.ADUFFCOPY)
+ p.To.Type = obj.TYPE_ADDR
+ p.To.Sym = ir.Syms.Duffcopy
+ if v.AuxInt%16 != 0 {
+ v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
}
+ p.To.Offset = 14 * (64 - v.AuxInt/16)
+ // 14 and 64 are magic constants. 14 is the number of bytes to encode:
+ // MOVUPS (SI), X0
+ // ADDQ $16, SI
+ // MOVUPS X0, (DI)
+ // ADDQ $16, DI
+ // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
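+ // For example, a 256-byte copy (AuxInt = 256) gets offset
+ // 14*(64 - 256/16) = 14*48 = 672, skipping the first 48 blocks so that
+ // exactly 16 blocks run and 16*16 = 256 bytes are copied.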
case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
if v.Type.IsMemory() {
p.To.Reg = reg
p.To.Offset = off
}
-
-// move 16 bytes from src+off to dst+off using temporary register tmp.
-func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
- // MOVUPS off(srcReg), tmpReg
- // MOVUPS tmpReg, off(dstReg)
- p := s.Prog(x86.AMOVUPS)
- p.From.Type = obj.TYPE_MEM
- p.From.Reg = src
- p.From.Offset = off
- p.To.Type = obj.TYPE_REG
- p.To.Reg = tmp
- p = s.Prog(x86.AMOVUPS)
- p.From.Type = obj.TYPE_REG
- p.From.Reg = tmp
- p.To.Type = obj.TYPE_MEM
- p.To.Reg = dst
- p.To.Offset = off
-}
(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
(Move [16] dst src mem) => (MOVOstore dst (MOVOload src mem) mem)
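+// Copies of 32, 48, and 64 bytes decompose into 16-byte (MOVO) moves. For
+// example, a 48-byte copy becomes a 16-byte move of bytes 0-15 plus a 32-byte
+// move at offset 16, which in turn splits into 16-byte moves of bytes 16-31
+// and 32-47.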
+(Move [32] dst src mem) =>
+ (Move [16]
+ (OffPtr <dst.Type> dst [16])
+ (OffPtr <src.Type> src [16])
+ (Move [16] dst src mem))
+
+(Move [48] dst src mem) =>
+ (Move [32]
+ (OffPtr <dst.Type> dst [16])
+ (OffPtr <src.Type> src [16])
+ (Move [16] dst src mem))
+
+(Move [64] dst src mem) =>
+ (Move [32]
+ (OffPtr <dst.Type> dst [32])
+ (OffPtr <src.Type> src [32])
+ (Move [32] dst src mem))
+
(Move [3] dst src mem) =>
(MOVBstore [2] dst (MOVBload [2] src mem)
(MOVWstore dst (MOVWload src mem) mem))
(MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
(MOVQstore dst (MOVQload src mem) mem))
-// Copying up to 192 bytes uses straightline code.
-(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
-
-// Copying up to ~1KB uses a small loop.
-(Move [s] dst src mem) && s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
+// Adjust moves to be a multiple of 16 bytes.
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 <= 8 =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (MOVQstore dst (MOVQload src mem) mem))
+(Move [s] dst src mem)
+ && s > 16 && s%16 != 0 && s%16 > 8 =>
+ (Move [s-s%16]
+ (OffPtr <dst.Type> dst [s%16])
+ (OffPtr <src.Type> src [s%16])
+ (MOVOstore dst (MOVOload src mem) mem))
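+// For example, a 40-byte copy (40%16 = 8) peels off a MOVQ of bytes 0-7 and
+// then moves the remaining 32 bytes at offset 8; a 44-byte copy (44%16 = 12)
+// peels off a MOVO of bytes 0-15 and then moves 32 bytes at offset 12
+// (bytes 12-15 are written twice with the same data).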
+
+// Medium copying uses a Duff's device.
+(Move [s] dst src mem)
+ && s > 64 && s <= 16*64 && s%16 == 0
+ && logLargeCopy(v, s) =>
+ (DUFFCOPY [s] dst src mem)
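+// (A DUFFCOPY of s bytes jumps into duffcopy so that exactly s/16 of its
+// 16-byte copy blocks execute; see the offset computation for
+// ssa.OpAMD64DUFFCOPY in amd64/ssa.go.)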
// Large copying uses REP MOVSQ.
-(Move [s] dst src mem) && s > repMoveThreshold && s%8 != 0 =>
- (Move [s-s%8]
- (OffPtr <dst.Type> dst [s%8])
- (OffPtr <src.Type> src [s%8])
- (MOVQstore dst (MOVQload src mem) mem))
-(Move [s] dst src mem) && s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) =>
+(Move [s] dst src mem) && s > 16*64 && s%8 == 0 && logLargeCopy(v, s) =>
(REPMOVSQ dst src (MOVQconst [s/8]) mem)
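+// For example, a 2048-byte copy emits REP MOVSQ with a quadword count of 2048/8 = 256.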
// Lowering Zero instructions
// arg0 = destination pointer
// arg1 = source pointer
// arg2 = mem
- // auxint = # of bytes to copy
+ // auxint = # of bytes to copy, must be a multiple of 16
// returns memory
{
- name: "LoweredMove",
+ name: "DUFFCOPY",
aux: "Int64",
argLength: 3,
reg: regInfo{
- inputs: []regMask{gp, gp},
- clobbers: buildReg("X14"), // uses X14 as a temporary
+ inputs: []regMask{buildReg("DI"), buildReg("SI")},
+ clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
},
- faultOnNilArg0: true,
- faultOnNilArg1: true,
- },
- // arg0 = destination pointer
- // arg1 = source pointer
- // arg2 = mem
- // auxint = # of bytes to copy
- // returns memory
- {
- name: "LoweredMoveLoop",
- aux: "Int64",
- argLength: 3,
- reg: regInfo{
- inputs: []regMask{gp, gp},
- clobbers: buildReg("X14"), // uses X14 as a temporary
- clobbersArg0: true,
- clobbersArg1: true,
- },
- clobberFlags: true,
- faultOnNilArg0: true,
- faultOnNilArg1: true,
- needIntTemp: true,
+ clobberFlags: true,
+ //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
+ //faultOnNilArg1: true,
+ unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
},
// arg0 = destination pointer
OpAMD64CALLtail
OpAMD64CALLclosure
OpAMD64CALLinter
- OpAMD64LoweredMove
- OpAMD64LoweredMoveLoop
+ OpAMD64DUFFCOPY
OpAMD64REPMOVSQ
OpAMD64InvertFlags
OpAMD64LoweredGetG
},
},
{
- name: "LoweredMove",
- auxType: auxInt64,
- argLen: 3,
- faultOnNilArg0: true,
- faultOnNilArg1: true,
- reg: regInfo{
- inputs: []inputInfo{
- {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
- {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
- },
- clobbers: 1073741824, // X14
- },
- },
- {
- name: "LoweredMoveLoop",
- auxType: auxInt64,
- argLen: 3,
- clobberFlags: true,
- needIntTemp: true,
- faultOnNilArg0: true,
- faultOnNilArg1: true,
+ name: "DUFFCOPY",
+ auxType: auxInt64,
+ argLen: 3,
+ clobberFlags: true,
+ unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
- {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
- {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ {0, 128}, // DI
+ {1, 64}, // SI
},
- clobbers: 1073741824, // X14
- clobbersArg0: true,
- clobbersArg1: true,
+ clobbers: 65728, // SI DI X0
},
},
{
pos = pos.WithNotStmt()
// Check if v is already in a requested register.
if mask&vi.regs != 0 {
- mask &= vi.regs
- r := pickReg(mask)
- if mask.contains(s.SPReg) {
- // Prefer the stack pointer if it is allowed.
- // (Needed because the op might have an Aux symbol
- // that needs SP as its base.)
- r = s.SPReg
- }
+ r := pickReg(mask & vi.regs)
if !s.allocatable.contains(r) {
return v // v is in a fixed register
}
}
}
-func TestClobbersArg1(t *testing.T) {
- c := testConfig(t)
- f := c.Fun("entry",
- Bloc("entry",
- Valu("mem", OpInitMem, types.TypeMem, 0, nil),
- Valu("src", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
- Valu("dst", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
- Valu("use1", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
- Valu("use2", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
- Valu("move", OpAMD64LoweredMoveLoop, types.TypeMem, 256, nil, "dst", "src", "mem"),
- Valu("store1", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use1", "src", "move"),
- Valu("store2", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use2", "dst", "store1"),
- Exit("store2")))
- flagalloc(f.f)
- regalloc(f.f)
- checkFunc(f.f)
- // LoweredMoveLoop clobbers its arguments, so there must be a copy of "src" and "dst" somewhere
- // so we still have that value available at the stores.
- if n := numCopies(f.blocks["entry"]); n != 2 {
- fmt.Printf("%s\n", f.f.String())
- t.Errorf("got %d copies, want 2", n)
- }
-}
-
func numSpills(b *Block) int {
return numOps(b, OpStoreReg)
}
removeDeadValues = true
repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing
- repMoveThreshold = 1408 // size beyond which we use REP MOVS for copying
)
// deadcode indicates whether rewrite should try to remove any values that become dead.
v.AddArg3(dst, v0, mem)
return true
}
+ // match: (Move [32] dst src mem)
+ // result: (Move [16] (OffPtr <dst.Type> dst [16]) (OffPtr <src.Type> src [16]) (Move [16] dst src mem))
+ for {
+ if auxIntToInt64(v.AuxInt) != 32 {
+ break
+ }
+ dst := v_0
+ src := v_1
+ mem := v_2
+ v.reset(OpMove)
+ v.AuxInt = int64ToAuxInt(16)
+ v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+ v0.AuxInt = int64ToAuxInt(16)
+ v0.AddArg(dst)
+ v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+ v1.AuxInt = int64ToAuxInt(16)
+ v1.AddArg(src)
+ v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
+ v2.AuxInt = int64ToAuxInt(16)
+ v2.AddArg3(dst, src, mem)
+ v.AddArg3(v0, v1, v2)
+ return true
+ }
+ // match: (Move [48] dst src mem)
+ // result: (Move [32] (OffPtr <dst.Type> dst [16]) (OffPtr <src.Type> src [16]) (Move [16] dst src mem))
+ for {
+ if auxIntToInt64(v.AuxInt) != 48 {
+ break
+ }
+ dst := v_0
+ src := v_1
+ mem := v_2
+ v.reset(OpMove)
+ v.AuxInt = int64ToAuxInt(32)
+ v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+ v0.AuxInt = int64ToAuxInt(16)
+ v0.AddArg(dst)
+ v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+ v1.AuxInt = int64ToAuxInt(16)
+ v1.AddArg(src)
+ v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
+ v2.AuxInt = int64ToAuxInt(16)
+ v2.AddArg3(dst, src, mem)
+ v.AddArg3(v0, v1, v2)
+ return true
+ }
+ // match: (Move [64] dst src mem)
+ // result: (Move [32] (OffPtr <dst.Type> dst [32]) (OffPtr <src.Type> src [32]) (Move [32] dst src mem))
+ for {
+ if auxIntToInt64(v.AuxInt) != 64 {
+ break
+ }
+ dst := v_0
+ src := v_1
+ mem := v_2
+ v.reset(OpMove)
+ v.AuxInt = int64ToAuxInt(32)
+ v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+ v0.AuxInt = int64ToAuxInt(32)
+ v0.AddArg(dst)
+ v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+ v1.AuxInt = int64ToAuxInt(32)
+ v1.AddArg(src)
+ v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
+ v2.AuxInt = int64ToAuxInt(32)
+ v2.AddArg3(dst, src, mem)
+ v.AddArg3(v0, v1, v2)
+ return true
+ }
// match: (Move [3] dst src mem)
// result: (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem))
for {
return true
}
// match: (Move [s] dst src mem)
- // cond: s > 16 && s < 192 && logLargeCopy(v, s)
- // result: (LoweredMove [s] dst src mem)
+ // cond: s > 16 && s%16 != 0 && s%16 <= 8
+ // result: (Move [s-s%16] (OffPtr <dst.Type> dst [s%16]) (OffPtr <src.Type> src [s%16]) (MOVQstore dst (MOVQload src mem) mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
- if !(s > 16 && s < 192 && logLargeCopy(v, s)) {
+ if !(s > 16 && s%16 != 0 && s%16 <= 8) {
break
}
- v.reset(OpAMD64LoweredMove)
- v.AuxInt = int64ToAuxInt(s)
- v.AddArg3(dst, src, mem)
+ v.reset(OpMove)
+ v.AuxInt = int64ToAuxInt(s - s%16)
+ v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+ v0.AuxInt = int64ToAuxInt(s % 16)
+ v0.AddArg(dst)
+ v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+ v1.AuxInt = int64ToAuxInt(s % 16)
+ v1.AddArg(src)
+ v2 := b.NewValue0(v.Pos, OpAMD64MOVQstore, types.TypeMem)
+ v3 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64)
+ v3.AddArg2(src, mem)
+ v2.AddArg3(dst, v3, mem)
+ v.AddArg3(v0, v1, v2)
return true
}
// match: (Move [s] dst src mem)
- // cond: s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s)
- // result: (LoweredMoveLoop [s] dst src mem)
+ // cond: s > 16 && s%16 != 0 && s%16 > 8
+ // result: (Move [s-s%16] (OffPtr <dst.Type> dst [s%16]) (OffPtr <src.Type> src [s%16]) (MOVOstore dst (MOVOload src mem) mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
- if !(s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s)) {
+ if !(s > 16 && s%16 != 0 && s%16 > 8) {
break
}
- v.reset(OpAMD64LoweredMoveLoop)
- v.AuxInt = int64ToAuxInt(s)
- v.AddArg3(dst, src, mem)
+ v.reset(OpMove)
+ v.AuxInt = int64ToAuxInt(s - s%16)
+ v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+ v0.AuxInt = int64ToAuxInt(s % 16)
+ v0.AddArg(dst)
+ v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+ v1.AuxInt = int64ToAuxInt(s % 16)
+ v1.AddArg(src)
+ v2 := b.NewValue0(v.Pos, OpAMD64MOVOstore, types.TypeMem)
+ v3 := b.NewValue0(v.Pos, OpAMD64MOVOload, types.TypeInt128)
+ v3.AddArg2(src, mem)
+ v2.AddArg3(dst, v3, mem)
+ v.AddArg3(v0, v1, v2)
return true
}
// match: (Move [s] dst src mem)
- // cond: s > repMoveThreshold && s%8 != 0
- // result: (Move [s-s%8] (OffPtr <dst.Type> dst [s%8]) (OffPtr <src.Type> src [s%8]) (MOVQstore dst (MOVQload src mem) mem))
+ // cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)
+ // result: (DUFFCOPY [s] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
- if !(s > repMoveThreshold && s%8 != 0) {
+ if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) {
break
}
- v.reset(OpMove)
- v.AuxInt = int64ToAuxInt(s - s%8)
- v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
- v0.AuxInt = int64ToAuxInt(s % 8)
- v0.AddArg(dst)
- v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
- v1.AuxInt = int64ToAuxInt(s % 8)
- v1.AddArg(src)
- v2 := b.NewValue0(v.Pos, OpAMD64MOVQstore, types.TypeMem)
- v3 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64)
- v3.AddArg2(src, mem)
- v2.AddArg3(dst, v3, mem)
- v.AddArg3(v0, v1, v2)
+ v.reset(OpAMD64DUFFCOPY)
+ v.AuxInt = int64ToAuxInt(s)
+ v.AddArg3(dst, src, mem)
return true
}
// match: (Move [s] dst src mem)
- // cond: s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s)
+ // cond: s > 16*64 && s%8 == 0 && logLargeCopy(v, s)
// result: (REPMOVSQ dst src (MOVQconst [s/8]) mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
- if !(s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s)) {
+ if !(s > 16*64 && s%8 == 0 && logLargeCopy(v, s)) {
break
}
v.reset(OpAMD64REPMOVSQ)