From: Keith Randall
Date: Tue, 3 Jun 2025 19:36:35 +0000 (-0700)
Subject: cmd/compile: simplify zerorange on amd64
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=b10eb1d042cb10031ad6d1b61bf7509501d62c81;p=gostls13.git

cmd/compile: simplify zerorange on amd64

Get rid of duffzero and large zeroing cases. We only use this code
for small things now.

Change-Id: Idcf330d0ac6433448efa8e32be7eb7f988e10122
Reviewed-on: https://go-review.googlesource.com/c/go/+/678619
Reviewed-by: Jorropo
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Michael Knyszek
Reviewed-by: Keith Randall
---

diff --git a/src/cmd/compile/internal/amd64/ggen.go b/src/cmd/compile/internal/amd64/ggen.go
index 1dc952a455..853a10cb9a 100644
--- a/src/cmd/compile/internal/amd64/ggen.go
+++ b/src/cmd/compile/internal/amd64/ggen.go
@@ -5,113 +5,23 @@
 package amd64
 
 import (
-	"cmd/compile/internal/ir"
 	"cmd/compile/internal/objw"
-	"cmd/compile/internal/types"
 	"cmd/internal/obj"
 	"cmd/internal/obj/x86"
 )
 
-// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
-// See runtime/mkduff.go.
-const (
-	dzBlocks    = 16 // number of MOV/ADD blocks
-	dzBlockLen  = 4  // number of clears per block
-	dzBlockSize = 23 // size of instructions in a single block
-	dzMovSize   = 5  // size of single MOV instruction w/ offset
-	dzLeaqSize  = 4  // size of single LEAQ instruction
-	dzClearStep = 16 // number of bytes cleared by each MOV instruction
-
-	dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
-	dzSize     = dzBlocks * dzBlockSize
-)
-
-// dzOff returns the offset for a jump into DUFFZERO.
-// b is the number of bytes to zero.
-func dzOff(b int64) int64 {
-	off := int64(dzSize)
-	off -= b / dzClearLen * dzBlockSize
-	tailLen := b % dzClearLen
-	if tailLen >= dzClearStep {
-		off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep)
-	}
-	return off
-}
-
-// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO.
-// b is the number of bytes to zero.
-func dzDI(b int64) int64 {
-	tailLen := b % dzClearLen
-	if tailLen < dzClearStep {
-		return 0
-	}
-	tailSteps := tailLen / dzClearStep
-	return -dzClearStep * (dzBlockLen - tailSteps)
-}
-
 func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
-	const (
-		r13 = 1 << iota // if R13 is already zeroed.
-	)
-
-	if cnt == 0 {
-		return p
+	if cnt%8 != 0 {
+		panic("zeroed region not aligned")
 	}
-
-	if cnt == 8 {
+	for cnt >= 16 {
+		p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
+		off += 16
+		cnt -= 16
+	}
+	if cnt != 0 {
 		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
-	} else if cnt <= int64(8*types.RegSize) {
-		for i := int64(0); i < cnt/16; i++ {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16)
-		}
-
-		if cnt%16 != 0 {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16))
-		}
-	} else if cnt <= int64(128*types.RegSize) {
-		// Save DI to r12. With the amd64 Go register abi, DI can contain
-		// an incoming parameter, whereas R12 is always scratch.
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
-		// Emit duffzero call
-		p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
-		p.To.Sym = ir.Syms.Duffzero
-		if cnt%16 != 0 {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
-		}
-		// Restore DI from r12
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-
-	} else {
-		// When the register ABI is in effect, at this point in the
-		// prolog we may have live values in all of RAX,RDI,RCX. Save
-		// them off to registers before the REPSTOSQ below, then
-		// restore. Note that R12 and R13 are always available as
-		// scratch regs; here we also use R15 (this is safe to do
-		// since there won't be any globals accessed in the prolog).
-		// See rewriteToUseGot() in obj6.go for more on r15 use.
-
-		// Save rax/rdi/rcx
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0)
-
-		// Set up the REPSTOSQ and kick it off.
-		p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0)
-		p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-		p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-
-		// Restore rax/rdi/rcx
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0)
-
-		// Record the fact that r13 is no longer zero.
-		*state &= ^uint32(r13)
 	}
-
 	return p
 }
 
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 9fce5cfc31..c50f187dc8 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -144,6 +144,15 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
 
 // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
 // See runtime/mkduff.go.
+const (
+	dzBlocks    = 16 // number of MOV/ADD blocks
+	dzBlockLen  = 4  // number of clears per block
+	dzBlockSize = 23 // size of instructions in a single block
+	dzMovSize   = 5  // size of single MOV instruction w/ offset
+	dzLeaqSize  = 4  // size of single LEAQ instruction
+	dzClearStep = 16 // number of bytes cleared by each MOV instruction
+)
+
 func duffStart(size int64) int64 {
 	x, _ := duff(size)
 	return x
diff --git a/src/cmd/compile/internal/liveness/plive.go b/src/cmd/compile/internal/liveness/plive.go
index 5a2a22ee8f..7e724397dc 100644
--- a/src/cmd/compile/internal/liveness/plive.go
+++ b/src/cmd/compile/internal/liveness/plive.go
@@ -769,7 +769,7 @@ func (lv *Liveness) epilogue() {
 				// its stack copy is not live.
 				continue
 			}
-			// Note: zeroing is handled by zeroResults in walk.go.
+			// Note: zeroing is handled by zeroResults in ../ssagen/ssa.go.
 			livedefer.Set(int32(i))
 		}
 		if n.IsOutputParamHeapAddr() {
diff --git a/src/cmd/compile/internal/ssagen/arch.go b/src/cmd/compile/internal/ssagen/arch.go
index 483e45cad4..ef5d8f59d7 100644
--- a/src/cmd/compile/internal/ssagen/arch.go
+++ b/src/cmd/compile/internal/ssagen/arch.go
@@ -25,8 +25,13 @@ type ArchInfo struct {
 
 	PadFrame func(int64) int64
 
-	// ZeroRange zeroes a range of memory on stack. It is only inserted
-	// at function entry, and it is ok to clobber registers.
+	// ZeroRange zeroes a range of memory on the stack.
+	//  - it is only called at function entry
+	//  - it is ok to clobber (non-arg) registers.
+	//  - currently used only for small things, so it can be simple.
+	//    - pointers to heap-allocated return values
+	//    - open-coded deferred functions
+	//    (Max size in make.bash is 40 bytes.)
 	ZeroRange func(*objw.Progs, *obj.Prog, int64, int64, *uint32) *obj.Prog
 
 	Ginsnop func(*objw.Progs) *obj.Prog
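
After this change, zerorange only ever emits straight-line stores: 16-byte
MOVUPS stores from X15, which the amd64 register ABI keeps zeroed, plus at
most one 8-byte MOVQ tail. Below is a minimal standalone sketch of that
store selection; the zeroStores helper and its string output are
illustrative only, not part of the compiler.

	package main

	import "fmt"

	// zeroStores mirrors the loop structure of the new zerorange:
	// 16-byte MOVUPS stores while at least 16 bytes remain, then one
	// 8-byte MOVQ for the tail. cnt must be a multiple of 8, as the
	// panic in zerorange enforces.
	func zeroStores(off, cnt int64) []string {
		if cnt%8 != 0 {
			panic("zeroed region not aligned")
		}
		var seq []string
		for cnt >= 16 {
			seq = append(seq, fmt.Sprintf("MOVUPS X15, %d(SP)", off))
			off += 16
			cnt -= 16
		}
		if cnt != 0 {
			seq = append(seq, fmt.Sprintf("MOVQ X15, %d(SP)", off))
		}
		return seq
	}

	func main() {
		// 40 bytes is the largest size seen in make.bash per the
		// new ZeroRange comment.
		for _, s := range zeroStores(8, 40) {
			fmt.Println(s)
		}
	}

For that 40-byte worst case this prints two MOVUPS stores and one MOVQ,
small enough that the duffzero and REP STOSQ paths no longer pay for their
setup and register shuffling.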