From 7380213a4eca31fb0da3b164a129eb5fd699d796 Mon Sep 17 00:00:00 2001
From: Junyang Shao
Date: Thu, 14 Aug 2025 20:21:37 +0000
Subject: [PATCH] [dev.simd] cmd/compile: make move/load/store dependent only
 on reg and width

This CL improves on its previous CL by implementing
move/load/storeByRegWidth. It should not have touched the compilation
path of complex128, but as a side effect, moves/loads/stores of 16-byte
SIMD vectors in X0 through X15 are now compiled to MOVUPS instead of
VMOVDQU.

These functions could also be used for MOV*const, but this CL does not
do that because we haven't seen problems there yet. If problems do show
up later, calling these functions to find the right asm might be handy.

Change-Id: I9b76e65eef8155479d3e288402aa96bc29a4f7cb
Reviewed-on: https://go-review.googlesource.com/c/go/+/696255
LUCI-TryBot-Result: Go LUCI
Reviewed-by: David Chase
---
 src/cmd/compile/internal/amd64/ssa.go | 108 ++++++++++++++++----------
 1 file changed, 67 insertions(+), 41 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 56d0ab2867..3ae3c61764 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -47,11 +47,19 @@ func isFPReg(r int16) bool {
 	return x86.REG_X0 <= r && r <= x86.REG_Z31
 }
 
-// loadByTypeAndReg returns the load instruction of the given type/register.
-func loadByTypeAndReg(t *types.Type, r int16) obj.As {
-	// Avoid partial register write
-	if !t.IsFloat() {
-		switch t.Size() {
+func isKReg(r int16) bool {
+	return x86.REG_K0 <= r && r <= x86.REG_K7
+}
+
+func isLowFPReg(r int16) bool {
+	return x86.REG_X0 <= r && r <= x86.REG_X15
+}
+
+// loadByRegWidth returns the load instruction of the given register of a given width.
+func loadByRegWidth(r int16, width int64) obj.As {
+	// Avoid partial register write for GPR
+	if !isFPReg(r) && !isKReg(r) {
+		switch width {
 		case 1:
 			return x86.AMOVBLZX
 		case 2:
@@ -59,24 +67,35 @@ func loadByTypeAndReg(t *types.Type, r int16) obj.As {
 		}
 	}
 	// Otherwise, there's no difference between load and store opcodes.
-	return storeByTypeAndReg(t, r)
+	return storeByRegWidth(r, width)
 }
 
-// storeByTypeAndReg returns the store instruction of the given type/register.
+// storeByRegWidth returns the store instruction of the given register of a given width.
 // It's also used for loading const to a reg.
-func storeByTypeAndReg(t *types.Type, r int16) obj.As {
-	width := t.Size()
-	if t.IsSIMD() {
-		return simdMov(width)
-	}
+func storeByRegWidth(r int16, width int64) obj.As {
 	if isFPReg(r) {
 		switch width {
 		case 4:
 			return x86.AMOVSS
 		case 8:
 			return x86.AMOVSD
+		case 16:
+			// int128s are in SSE registers
+			if isLowFPReg(r) {
+				return x86.AMOVUPS
+			} else {
+				return x86.AVMOVDQU
+			}
+		case 32:
+			return x86.AVMOVDQU
+		case 64:
+			return x86.AVMOVDQU64
 		}
 	}
+	if isKReg(r) {
+		return x86.AKMOVQ
+	}
+	// gp
 	switch width {
 	case 1:
 		return x86.AMOVB
@@ -86,25 +105,32 @@ func storeByTypeAndReg(t *types.Type, r int16) obj.As {
 		return x86.AMOVL
 	case 8:
 		return x86.AMOVQ
-	case 16:
-		return x86.AMOVUPS
 	}
-	panic(fmt.Sprintf("bad store type %v", t))
+	panic(fmt.Sprintf("bad store reg=%v, width=%d", r, width))
 }
 
-// moveByTypeAndReg returns the reg->reg move instruction of the given type/registers.
-func moveByTypeAndReg(t *types.Type, dest, src int16) obj.As {
-	width := t.Size()
-	if t.IsSIMD() {
-		return simdMov(t.Size())
-	}
+// moveByRegsWidth returns the reg->reg move instruction of the given dest/src registers of a given width.
+func moveByRegsWidth(dest, src int16, width int64) obj.As {
 	// fp -> fp
 	if isFPReg(dest) && isFPReg(src) {
 		// Moving the whole sse2 register is faster
 		// than moving just the correct low portion of it.
 		// There is no xmm->xmm move with 1 byte opcode,
 		// so use movups, which has 2 byte opcode.
-		return x86.AMOVUPS
+		if isLowFPReg(dest) && isLowFPReg(src) && width <= 16 {
+			return x86.AMOVUPS
+		}
+		if width <= 32 {
+			return x86.AVMOVDQU
+		}
+		return x86.AVMOVDQU64
+	}
+	// k -> gp, gp -> k, k -> k
+	if isKReg(dest) || isKReg(src) {
+		if isFPReg(dest) || isFPReg(src) {
+			panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
+		}
+		return x86.AKMOVQ
 	}
 	// gp -> fp, fp -> gp, gp -> gp
 	switch width {
@@ -118,9 +144,18 @@ func moveByTypeAndReg(t *types.Type, dest, src int16) obj.As {
 	case 8:
 		return x86.AMOVQ
 	case 16:
-		return x86.AMOVUPS // int128s are in SSE registers
+		if isLowFPReg(dest) && isLowFPReg(src) {
+			// int128s are in SSE registers
+			return x86.AMOVUPS
+		} else {
+			return x86.AVMOVDQU
+		}
+	case 32:
+		return x86.AVMOVDQU
+	case 64:
+		return x86.AVMOVDQU64
 	}
-	panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
+	panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
 }
 
 // opregreg emits instructions for
@@ -616,7 +651,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		// But this requires a way for regalloc to know that SRC might be
 		// clobbered by this instruction.
 		t := v.RegTmp()
-		opregreg(s, moveByTypeAndReg(v.Type, t, v.Args[1].Reg()), t, v.Args[1].Reg())
+		opregreg(s, moveByRegsWidth(t, v.Args[1].Reg(), v.Type.Size()), t, v.Args[1].Reg())
 
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
@@ -795,7 +830,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			opregreg(s, x86.AXORL, x, x)
 			break
 		}
-		p := s.Prog(storeByTypeAndReg(v.Type, x))
+		p := s.Prog(storeByRegWidth(x, v.Type.Size()))
 		p.From.Type = obj.TYPE_FCONST
 		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
 		p.To.Type = obj.TYPE_REG
@@ -1197,7 +1232,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			y = simdOrMaskReg(v)
 		}
 		if x != y {
-			opregreg(s, moveByTypeAndReg(v.Type, y, x), y, x)
+			opregreg(s, moveByRegsWidth(y, x, v.Type.Size()), y, x)
 		}
 	case ssa.OpLoadReg:
 		if v.Type.IsFlags() {
@@ -1205,7 +1240,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			return
 		}
 		r := v.Reg()
-		p := s.Prog(loadByTypeAndReg(v.Type, r))
+		p := s.Prog(loadByRegWidth(r, v.Type.Size()))
 		ssagen.AddrAuto(&p.From, v.Args[0])
 		p.To.Type = obj.TYPE_REG
 		if v.Type.IsSIMD() {
@@ -1222,7 +1257,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		if v.Type.IsSIMD() {
 			r = simdOrMaskReg(v.Args[0])
 		}
-		p := s.Prog(storeByTypeAndReg(v.Type, r))
+		p := s.Prog(storeByRegWidth(r, v.Type.Size()))
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = r
 		ssagen.AddrAuto(&p.To, v)
@@ -1239,7 +1274,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			// Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack.
 			addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize)
 			s.FuncInfo().AddSpill(
-				obj.RegSpill{Reg: ap.Reg, Addr: addr, Unspill: loadByTypeAndReg(ap.Type, ap.Reg), Spill: storeByTypeAndReg(ap.Type, ap.Reg)})
+				obj.RegSpill{Reg: ap.Reg, Addr: addr, Unspill: loadByRegWidth(ap.Reg, ap.Type.Size()), Spill: storeByRegWidth(ap.Reg, ap.Type.Size())})
 		}
 		v.Block.Func.RegArgs = nil
 		ssagen.CheckArgReg(v)
@@ -2123,7 +2158,7 @@ func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
 }
 
 func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
-	p := s.Prog(loadByTypeAndReg(t, reg))
+	p := s.Prog(loadByRegWidth(reg, t.Size()))
 	p.From.Type = obj.TYPE_MEM
 	p.From.Name = obj.NAME_AUTO
 	p.From.Sym = n.Linksym()
@@ -2134,7 +2169,7 @@ func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir
 }
 
 func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
-	p = pp.Append(p, storeByTypeAndReg(t, reg), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
+	p = pp.Append(p, storeByRegWidth(reg, t.Size()), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
 	p.To.Name = obj.NAME_PARAM
 	p.To.Sym = n.Linksym()
 	p.Pos = p.Pos.WithNotStmt()
@@ -2220,12 +2255,3 @@ func simdCheckRegOnly(v *ssa.Value, regStart, regEnd int16) int16 {
 	}
 	return v.Reg()
 }
-
-func simdMov(width int64) obj.As {
-	if width >= 64 {
-		return x86.AVMOVDQU64
-	} else if width >= 16 {
-		return x86.AVMOVDQU
-	}
-	return x86.AKMOVQ
-}
-- 
2.52.0
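
For reference, the new helpers pick an opcode from the register class and the
width alone. The following standalone sketch mirrors only the vector and mask
cases of storeByRegWidth; the reg* constants and opcode strings are
illustrative stand-ins, not the real cmd/internal/obj/x86 values, and the
function is a mirror of the patch's shape rather than the patch itself.

package main

import "fmt"

// Illustrative stand-ins for the x86 register ranges used by the helpers;
// the numeric values are arbitrary and only their ordering matters here.
const (
	regX0  = 100 // stand-in for x86.REG_X0
	regX15 = 115 // stand-in for x86.REG_X15 (last SSE-encodable register)
	regZ31 = 199 // stand-in for x86.REG_Z31 (last vector register)
	regK0  = 200 // stand-in for x86.REG_K0
	regK7  = 207 // stand-in for x86.REG_K7
)

func isFPReg(r int16) bool    { return regX0 <= r && r <= regZ31 }
func isKReg(r int16) bool     { return regK0 <= r && r <= regK7 }
func isLowFPReg(r int16) bool { return regX0 <= r && r <= regX15 }

// storeByRegWidth mirrors the patch's vector and mask cases: only the
// register class and the width decide the opcode.
func storeByRegWidth(r int16, width int64) string {
	if isFPReg(r) {
		switch width {
		case 16:
			if isLowFPReg(r) {
				return "MOVUPS" // X0-X15: the shorter SSE encoding suffices
			}
			return "VMOVDQU" // registers above X15 have no plain SSE encoding
		case 32:
			return "VMOVDQU"
		case 64:
			return "VMOVDQU64"
		}
	}
	if isKReg(r) {
		return "KMOVQ"
	}
	return fmt.Sprintf("not covered by this sketch: reg=%d width=%d", r, width)
}

func main() {
	fmt.Println(storeByRegWidth(regX0+3, 16))  // MOVUPS: 16-byte vector in X3
	fmt.Println(storeByRegWidth(regX15+2, 16)) // VMOVDQU: same width, above X15
	fmt.Println(storeByRegWidth(regK0+1, 8))   // KMOVQ: mask register
}

The second call is the side effect the commit message mentions: the same
16-byte width picks a different opcode once the register is above X15.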