From: Keith Randall
Date: Tue, 3 Jun 2025 19:36:35 +0000 (-0700)
Subject: cmd/compile: simplify zerorange on amd64
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=b10eb1d042cb10031ad6d1b61bf7509501d62c81;p=gostls13.git

cmd/compile: simplify zerorange on amd64

Get rid of duffzero and large zeroing cases. We only use this code
for small things now.

Change-Id: Idcf330d0ac6433448efa8e32be7eb7f988e10122
Reviewed-on: https://go-review.googlesource.com/c/go/+/678619
Reviewed-by: Jorropo
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Michael Knyszek
Reviewed-by: Keith Randall
---

diff --git a/src/cmd/compile/internal/amd64/ggen.go b/src/cmd/compile/internal/amd64/ggen.go
index 1dc952a455..853a10cb9a 100644
--- a/src/cmd/compile/internal/amd64/ggen.go
+++ b/src/cmd/compile/internal/amd64/ggen.go
@@ -5,113 +5,23 @@
 package amd64
 
 import (
-	"cmd/compile/internal/ir"
 	"cmd/compile/internal/objw"
-	"cmd/compile/internal/types"
 	"cmd/internal/obj"
 	"cmd/internal/obj/x86"
 )
 
-// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
-// See runtime/mkduff.go.
-const (
-	dzBlocks    = 16 // number of MOV/ADD blocks
-	dzBlockLen  = 4  // number of clears per block
-	dzBlockSize = 23 // size of instructions in a single block
-	dzMovSize   = 5  // size of single MOV instruction w/ offset
-	dzLeaqSize  = 4  // size of single LEAQ instruction
-	dzClearStep = 16 // number of bytes cleared by each MOV instruction
-
-	dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
-	dzSize     = dzBlocks * dzBlockSize
-)
-
-// dzOff returns the offset for a jump into DUFFZERO.
-// b is the number of bytes to zero.
-func dzOff(b int64) int64 {
-	off := int64(dzSize)
-	off -= b / dzClearLen * dzBlockSize
-	tailLen := b % dzClearLen
-	if tailLen >= dzClearStep {
-		off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep)
-	}
-	return off
-}
-
-// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO.
-// b is the number of bytes to zero.
-func dzDI(b int64) int64 {
-	tailLen := b % dzClearLen
-	if tailLen < dzClearStep {
-		return 0
-	}
-	tailSteps := tailLen / dzClearStep
-	return -dzClearStep * (dzBlockLen - tailSteps)
-}
-
 func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
-	const (
-		r13 = 1 << iota // if R13 is already zeroed.
-	)
-
-	if cnt == 0 {
-		return p
+	if cnt%8 != 0 {
+		panic("zeroed region not aligned")
 	}
-
-	if cnt == 8 {
+	for cnt >= 16 {
+		p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
+		off += 16
+		cnt -= 16
+	}
+	if cnt != 0 {
 		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
-	} else if cnt <= int64(8*types.RegSize) {
-		for i := int64(0); i < cnt/16; i++ {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16)
-		}
-
-		if cnt%16 != 0 {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16))
-		}
-	} else if cnt <= int64(128*types.RegSize) {
-		// Save DI to r12. With the amd64 Go register abi, DI can contain
-		// an incoming parameter, whereas R12 is always scratch.
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
-		// Emit duffzero call
-		p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
-		p.To.Sym = ir.Syms.Duffzero
-		if cnt%16 != 0 {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
-		}
-		// Restore DI from r12
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-
-	} else {
-		// When the register ABI is in effect, at this point in the
-		// prolog we may have live values in all of RAX,RDI,RCX. Save
-		// them off to registers before the REPSTOSQ below, then
-		// restore. Note that R12 and R13 are always available as
-		// scratch regs; here we also use R15 (this is safe to do
-		// since there won't be any globals accessed in the prolog).
-		// See rewriteToUseGot() in obj6.go for more on r15 use.
-
-		// Save rax/rdi/rcx
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0)
-
-		// Set up the REPSTOSQ and kick it off.
-		p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0)
-		p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-		p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-
-		// Restore rax/rdi/rcx
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0)
-
-		// Record the fact that r13 is no longer zero.
-		*state &= ^uint32(r13)
 	}
-
 	return p
 }
 
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 9fce5cfc31..c50f187dc8 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -144,6 +144,15 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
 
 // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
 // See runtime/mkduff.go.
+const (
+	dzBlocks    = 16 // number of MOV/ADD blocks
+	dzBlockLen  = 4  // number of clears per block
+	dzBlockSize = 23 // size of instructions in a single block
+	dzMovSize   = 5  // size of single MOV instruction w/ offset
+	dzLeaqSize  = 4  // size of single LEAQ instruction
+	dzClearStep = 16 // number of bytes cleared by each MOV instruction
+)
+
 func duffStart(size int64) int64 {
 	x, _ := duff(size)
 	return x
diff --git a/src/cmd/compile/internal/liveness/plive.go b/src/cmd/compile/internal/liveness/plive.go
index 5a2a22ee8f..7e724397dc 100644
--- a/src/cmd/compile/internal/liveness/plive.go
+++ b/src/cmd/compile/internal/liveness/plive.go
@@ -769,7 +769,7 @@ func (lv *Liveness) epilogue() {
 				// its stack copy is not live.
 				continue
 			}
-			// Note: zeroing is handled by zeroResults in walk.go.
+			// Note: zeroing is handled by zeroResults in ../ssagen/ssa.go.
 			livedefer.Set(int32(i))
 		}
 		if n.IsOutputParamHeapAddr() {
diff --git a/src/cmd/compile/internal/ssagen/arch.go b/src/cmd/compile/internal/ssagen/arch.go
index 483e45cad4..ef5d8f59d7 100644
--- a/src/cmd/compile/internal/ssagen/arch.go
+++ b/src/cmd/compile/internal/ssagen/arch.go
@@ -25,8 +25,13 @@ type ArchInfo struct {
 
 	PadFrame func(int64) int64
 
-	// ZeroRange zeroes a range of memory on stack. It is only inserted
-	// at function entry, and it is ok to clobber registers.
+	// ZeroRange zeroes a range of memory on the stack.
+	//  - it is only called at function entry
+	//  - it is ok to clobber (non-arg) registers.
+	//  - currently used only for small things, so it can be simple.
+	//    - pointers to heap-allocated return values
+	//    - open-coded deferred functions
+	//    (Max size in make.bash is 40 bytes.)
 	ZeroRange func(*objw.Progs, *obj.Prog, int64, int64, *uint32) *obj.Prog
 
 	Ginsnop func(*objw.Progs) *obj.Prog
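
After this change, zerorange only ever emits straight-line stores: 16-byte
MOVUPS stores from X15, which the amd64 register ABI keeps zeroed, plus at
most one 8-byte MOVQ tail. Below is a minimal standalone sketch of that
store selection; the zeroStores helper and its string output are
illustrative only, not part of the compiler.

	package main

	import "fmt"

	// zeroStores mirrors the loop structure of the new zerorange:
	// 16-byte MOVUPS stores while at least 16 bytes remain, then one
	// 8-byte MOVQ for the tail. cnt must be a multiple of 8, as the
	// panic in zerorange enforces.
	func zeroStores(off, cnt int64) []string {
		if cnt%8 != 0 {
			panic("zeroed region not aligned")
		}
		var seq []string
		for cnt >= 16 {
			seq = append(seq, fmt.Sprintf("MOVUPS X15, %d(SP)", off))
			off += 16
			cnt -= 16
		}
		if cnt != 0 {
			seq = append(seq, fmt.Sprintf("MOVQ X15, %d(SP)", off))
		}
		return seq
	}

	func main() {
		// 40 bytes is the largest size seen in make.bash per the
		// new ZeroRange comment.
		for _, s := range zeroStores(8, 40) {
			fmt.Println(s)
		}
	}

For that 40-byte worst case this prints two MOVUPS stores and one MOVQ,
small enough that the duffzero and REP STOSQ paths no longer pay for their
setup and register shuffling.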