cmd/compile: simplify zerorange on amd64

author Keith Randall <khr@golang.org>

Tue, 3 Jun 2025 19:36:35 +0000 (12:36 -0700)

committer Keith Randall <khr@golang.org>

Tue, 29 Jul 2025 00:39:26 +0000 (17:39 -0700)
author Keith Randall <khr@golang.org>
Tue, 3 Jun 2025 19:36:35 +0000 (12:36 -0700)
committer Keith Randall <khr@golang.org>
Tue, 29 Jul 2025 00:39:26 +0000 (17:39 -0700)
diff --git a/src/cmd/compile/internal/amd64/ggen.go b/src/cmd/compile/internal/amd64/ggen.go

index 1dc952a4557c12dd13dfb4787121ae3b32543cc3..853a10cb9a396aff8a3fabb7e438589ec15e876a 100644 (file)
--- a/src/cmd/compile/internal/amd64/ggen.go
+++ b/src/cmd/compile/internal/amd64/ggen.go
@@ -5,113 +5,23 @@
  package amd64
  
  import (
-       "cmd/compile/internal/ir"
         "cmd/compile/internal/objw"
-       "cmd/compile/internal/types"
         "cmd/internal/obj"
         "cmd/internal/obj/x86"
  )
  
-// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
-// See runtime/mkduff.go.
-const (
-       dzBlocks    = 16 // number of MOV/ADD blocks
-       dzBlockLen  = 4  // number of clears per block
-       dzBlockSize = 23 // size of instructions in a single block
-       dzMovSize   = 5  // size of single MOV instruction w/ offset
-       dzLeaqSize  = 4  // size of single LEAQ instruction
-       dzClearStep = 16 // number of bytes cleared by each MOV instruction
-
-       dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
-       dzSize     = dzBlocks * dzBlockSize
-)
-
-// dzOff returns the offset for a jump into DUFFZERO.
-// b is the number of bytes to zero.
-func dzOff(b int64) int64 {
-       off := int64(dzSize)
-       off -= b / dzClearLen * dzBlockSize
-       tailLen := b % dzClearLen
-       if tailLen >= dzClearStep {
-               off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep)
-       }
-       return off
-}
-
-// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO.
-// b is the number of bytes to zero.
-func dzDI(b int64) int64 {
-       tailLen := b % dzClearLen
-       if tailLen < dzClearStep {
-               return 0
-       }
-       tailSteps := tailLen / dzClearStep
-       return -dzClearStep * (dzBlockLen - tailSteps)
-}
-
  func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
-       const (
-               r13 = 1 << iota // if R13 is already zeroed.
-       )
-
-       if cnt == 0 {
-               return p
+       if cnt%8 != 0 {
+               panic("zeroed region not aligned")
         }
-
-       if cnt == 8 {
+       for cnt >= 16 {
+               p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
+               off += 16
+               cnt -= 16
+       }
+       if cnt != 0 {
                 p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
-       } else if cnt <= int64(8*types.RegSize) {
-               for i := int64(0); i < cnt/16; i++ {
-                       p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16)
-               }
-
-               if cnt%16 != 0 {
-                       p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16))
-               }
-       } else if cnt <= int64(128*types.RegSize) {
-               // Save DI to r12. With the amd64 Go register abi, DI can contain
-               // an incoming parameter, whereas R12 is always scratch.
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
-               // Emit duffzero call
-               p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
-               p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
-               p.To.Sym = ir.Syms.Duffzero
-               if cnt%16 != 0 {
-                       p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
-               }
-               // Restore DI from r12
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-
-       } else {
-               // When the register ABI is in effect, at this point in the
-               // prolog we may have live values in all of RAX,RDI,RCX. Save
-               // them off to registers before the REPSTOSQ below, then
-               // restore. Note that R12 and R13 are always available as
-               // scratch regs; here we also use R15 (this is safe to do
-               // since there won't be any globals accessed in the prolog).
-               // See rewriteToUseGot() in obj6.go for more on r15 use.
-
-               // Save rax/rdi/rcx
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0)
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0)
-
-               // Set up the REPSTOSQ and kick it off.
-               p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0)
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0)
-               p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0)
-               p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-               p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-
-               // Restore rax/rdi/rcx
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0)
-               p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0)
-
-               // Record the fact that r13 is no longer zero.
-               *state &= ^uint32(r13)
         }
-
         return p
  }
  
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go

index 9fce5cfc31d4f8e225b4ffe557af905174f1750d..c50f187dc8613a886fe7b60d6cac1f38f89cceae 100644 (file)
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -144,6 +144,15 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
  
  // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
  // See runtime/mkduff.go.
+const (
+       dzBlocks    = 16 // number of MOV/ADD blocks
+       dzBlockLen  = 4  // number of clears per block
+       dzBlockSize = 23 // size of instructions in a single block
+       dzMovSize   = 5  // size of single MOV instruction w/ offset
+       dzLeaqSize  = 4  // size of single LEAQ instruction
+       dzClearStep = 16 // number of bytes cleared by each MOV instruction
+)
+
  func duffStart(size int64) int64 {
         x, _ := duff(size)
         return x
diff --git a/src/cmd/compile/internal/liveness/plive.go b/src/cmd/compile/internal/liveness/plive.go

index 5a2a22ee8f5c82fe8a12d013ae5f0657a4bc83d7..7e724397dcfd6278d2b5254e396c803cdf59b34f 100644 (file)
--- a/src/cmd/compile/internal/liveness/plive.go
+++ b/src/cmd/compile/internal/liveness/plive.go
@@ -769,7 +769,7 @@ func (lv *Liveness) epilogue() {
                                         // its stack copy is not live.
                                         continue
                                 }
-                               // Note: zeroing is handled by zeroResults in walk.go.
+                               // Note: zeroing is handled by zeroResults in ../ssagen/ssa.go.
                                 livedefer.Set(int32(i))
                         }
                         if n.IsOutputParamHeapAddr() {
diff --git a/src/cmd/compile/internal/ssagen/arch.go b/src/cmd/compile/internal/ssagen/arch.go

index 483e45cad43c7445c46032111434fffe16cd62a7..ef5d8f59d7168a8bc0c3c9f62d1fd4cf471f6089 100644 (file)
--- a/src/cmd/compile/internal/ssagen/arch.go
+++ b/src/cmd/compile/internal/ssagen/arch.go
@@ -25,8 +25,13 @@ type ArchInfo struct {
  
         PadFrame func(int64) int64
  
-       // ZeroRange zeroes a range of memory on stack. It is only inserted
-       // at function entry, and it is ok to clobber registers.
+       // ZeroRange zeroes a range of memory on the stack.
+       //  - it is only called at function entry
+       //  - it is ok to clobber (non-arg) registers.
+       //  - currently used only for small things, so it can be simple.
+       //    - pointers to heap-allocated return values
+       //    - open-coded deferred functions
+       // (Max size in make.bash is 40 bytes.)
         ZeroRange func(*objw.Progs, *obj.Prog, int64, int64, *uint32) *obj.Prog
  
         Ginsnop func(*objw.Progs) *obj.Prog
author	Keith Randall <khr@golang.org>
	Tue, 3 Jun 2025 19:36:35 +0000 (12:36 -0700)
committer	Keith Randall <khr@golang.org>
	Tue, 29 Jul 2025 00:39:26 +0000 (17:39 -0700)
src/cmd/compile/internal/amd64/ggen.go		patch \| blob \| history
src/cmd/compile/internal/amd64/ssa.go		patch \| blob \| history
src/cmd/compile/internal/liveness/plive.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/arch.go		patch \| blob \| history