package amd64
import (
- "cmd/compile/internal/ir"
"cmd/compile/internal/objw"
- "cmd/compile/internal/types"
"cmd/internal/obj"
"cmd/internal/obj/x86"
)
-// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
-// See runtime/mkduff.go.
-const (
- dzBlocks = 16 // number of MOV/ADD blocks
- dzBlockLen = 4 // number of clears per block
- dzBlockSize = 23 // size of instructions in a single block
- dzMovSize = 5 // size of single MOV instruction w/ offset
- dzLeaqSize = 4 // size of single LEAQ instruction
- dzClearStep = 16 // number of bytes cleared by each MOV instruction
-
- dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
- dzSize = dzBlocks * dzBlockSize
-)
-
-// dzOff returns the offset for a jump into DUFFZERO.
-// b is the number of bytes to zero.
-func dzOff(b int64) int64 {
- off := int64(dzSize)
- off -= b / dzClearLen * dzBlockSize
- tailLen := b % dzClearLen
- if tailLen >= dzClearStep {
- off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep)
- }
- return off
-}
-
-// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO.
-// b is the number of bytes to zero.
-func dzDI(b int64) int64 {
- tailLen := b % dzClearLen
- if tailLen < dzClearStep {
- return 0
- }
- tailSteps := tailLen / dzClearStep
- return -dzClearStep * (dzBlockLen - tailSteps)
-}
-
// zerorange appends, after p, instructions that store X15 over the cnt bytes
// starting at off(SP), and returns the last instruction appended (p itself
// when cnt is 0). It panics if cnt is not a multiple of 8.
// NOTE(review): X15 is presumably the always-zero register of Go's internal
// amd64 ABI, which is what makes these stores a zeroing sequence — confirm.
// NOTE(review): once the "-" lines are gone, nothing in the body reads or
// writes state; the parameter appears to remain only for the signature.
func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
- const (
- r13 = 1 << iota // if R13 is already zeroed.
- )
-
- if cnt == 0 {
- return p
+ if cnt%8 != 0 { // the stores below are 8 or 16 bytes wide, so cnt must be a multiple of 8
+ panic("zeroed region not aligned")
}
-
- if cnt == 8 {
+ for cnt >= 16 { // one 16-byte MOVUPS per iteration, advancing off and shrinking cnt
+ p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
+ off += 16
+ cnt -= 16
+ }
+ if cnt != 0 { // for non-negative cnt exactly 8 bytes remain here; one MOVQ covers them
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
- } else if cnt <= int64(8*types.RegSize) {
- for i := int64(0); i < cnt/16; i++ {
- p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16)
- }
-
- if cnt%16 != 0 {
- p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16))
- }
- } else if cnt <= int64(128*types.RegSize) {
- // Save DI to r12. With the amd64 Go register abi, DI can contain
- // an incoming parameter, whereas R12 is always scratch.
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
- // Emit duffzero call
- p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
- p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
- p.To.Sym = ir.Syms.Duffzero
- if cnt%16 != 0 {
- p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
- }
- // Restore DI from r12
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-
- } else {
- // When the register ABI is in effect, at this point in the
- // prolog we may have live values in all of RAX,RDI,RCX. Save
- // them off to registers before the REPSTOSQ below, then
- // restore. Note that R12 and R13 are always available as
- // scratch regs; here we also use R15 (this is safe to do
- // since there won't be any globals accessed in the prolog).
- // See rewriteToUseGot() in obj6.go for more on r15 use.
-
- // Save rax/rdi/rcx
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0)
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0)
-
- // Set up the REPSTOSQ and kick it off.
- p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0)
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0)
- p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0)
- p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
- p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-
- // Restore rax/rdi/rcx
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0)
- p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0)
-
- // Record the fact that r13 is no longer zero.
- *state &= ^uint32(r13)
}
-
return p
}