From d5dea86993e1bc07bb9a49d2930655050da006d7 Mon Sep 17 00:00:00 2001
From: David Chase
Date: Thu, 7 Aug 2025 16:44:50 -0400
Subject: [PATCH] [dev.simd] cmd/compile: fix isIntrinsic for methods; fix fp <-> gp moves

also includes a handy debugging hook for the inliner.

Change-Id: I23d0619506219d21db78c6c801612ff058562142
Reviewed-on: https://go-review.googlesource.com/c/go/+/694118
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Junyang Shao
---
 src/cmd/compile/internal/amd64/ssa.go         | 84 +++++++++++++------
 src/cmd/compile/internal/inline/inl.go        | 36 +++++++-
 src/cmd/compile/internal/ssagen/intrinsics.go |  7 ++
 3 files changed, 97 insertions(+), 30 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index d3fae7ce14..38815929d2 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -43,6 +43,10 @@ func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
 	}
 }
 
+func isFPReg(r int16) bool {
+	return x86.REG_X0 <= r && r <= x86.REG_Z31
+}
+
 // loadByType returns the load instruction of the given type.
 func loadByType(t *types.Type) obj.As {
 	// Avoid partial register write
@@ -88,31 +92,33 @@ func storeByType(t *types.Type) obj.As {
 }
 
 // moveByType returns the reg->reg move instruction of the given type.
-func moveByType(t *types.Type) obj.As {
-	if t.IsFloat() {
+func moveByType(from, to *ssa.Value) obj.As {
+	toT := to.Type
+	fromR, toR := from.Reg(), to.Reg()
+	if isFPReg(fromR) && isFPReg(toR) && toT.IsFloat() {
 		// Moving the whole sse2 register is faster
 		// than moving just the correct low portion of it.
 		// There is no xmm->xmm move with 1 byte opcode,
 		// so use movups, which has 2 byte opcode.
 		return x86.AMOVUPS
-	} else if t.IsSIMD() {
-		return simdMov(t.Size())
-	} else {
-		switch t.Size() {
-		case 1:
-			// Avoids partial register write
-			return x86.AMOVL
-		case 2:
-			return x86.AMOVL
-		case 4:
-			return x86.AMOVL
-		case 8:
-			return x86.AMOVQ
-		case 16:
-			return x86.AMOVUPS // int128s are in SSE registers
-		default:
-			panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
-		}
+	}
+	if toT.IsSIMD() {
+		return simdMov(toT.Size())
+	}
+	switch toT.Size() {
+	case 1:
+		// Avoids partial register write
+		return x86.AMOVL
+	case 2:
+		return x86.AMOVL
+	case 4:
+		return x86.AMOVL
+	case 8:
+		return x86.AMOVQ
+	case 16:
+		return x86.AMOVUPS // int128s are in SSE registers
+	default:
+		panic(fmt.Sprintf("bad int register width %d:%v", toT.Size(), toT))
 	}
 }
 
@@ -648,7 +654,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		// But this requires a way for regalloc to know that SRC might be
 		// clobbered by this instruction.
 		t := v.RegTmp()
-		opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
+		opregreg(s, moveByType(v.Args[1], v), t, v.Args[1].Reg())
 
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
@@ -820,13 +826,37 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.From.Offset = v.AuxInt
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = x
+
 	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
 		x := v.Reg()
-		p := s.Prog(v.Op.Asm())
-		p.From.Type = obj.TYPE_FCONST
-		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = x
+		a := v.Op.Asm()
+		if x < x86.REG_X0 { // not an FP register
+			if v.AuxInt == 0 && v.Aux == nil {
+				opregreg(s, x86.AXORL, x, x)
+				break
+			}
+			c := v.AuxInt
+			switch v.Type.Size() {
+			case 4:
+				a = x86.AMOVL
+				c = int64(math.Float32bits(float32(math.Float64frombits(uint64(v.AuxInt)))))
+			case 8:
+				a = x86.AMOVQ
+			default:
+				panic(fmt.Sprintf("unexpected type width for float const into non-float register, %v", v))
+			}
+			p := s.Prog(a)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = c
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x
+		} else {
+			p := s.Prog(a)
+			p.From.Type = obj.TYPE_FCONST
+			p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x
+		}
 	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
 		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
 		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
@@ -1134,7 +1164,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			y = simdOrMaskReg(v)
 		}
 		if x != y {
-			opregreg(s, moveByType(v.Type), y, x)
+			opregreg(s, moveByType(v.Args[0], v), y, x)
 		}
 	case ssa.OpLoadReg:
 		if v.Type.IsFlags() {
diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go
index c06f76fe9f..1ba8350803 100644
--- a/src/cmd/compile/internal/inline/inl.go
+++ b/src/cmd/compile/internal/inline/inl.go
@@ -202,6 +202,7 @@ func inlineBudget(fn *ir.Func, profile *pgoir.Profile, relaxed bool, verbose boo
 		// be very liberal here, if the closure is only called once, the budget is large
 		budget = max(budget, inlineClosureCalledOnceCost)
 	}
+
 	return budget
 }
 
@@ -263,6 +264,7 @@ func CanInline(fn *ir.Func, profile *pgoir.Profile) {
 
 	visitor := hairyVisitor{
 		curFunc:       fn,
+		debug:         isDebugFn(fn),
 		isBigFunc:     IsBigFunc(fn),
 		budget:        budget,
 		maxBudget:     budget,
@@ -407,6 +409,7 @@ type hairyVisitor struct {
 	// This is needed to access the current caller in the doNode function.
 	curFunc       *ir.Func
 	isBigFunc     bool
+	debug         bool
 	budget        int32
 	maxBudget     int32
 	reason        string
@@ -416,6 +419,16 @@ type hairyVisitor struct {
 	profile       *pgoir.Profile
 }
 
+func isDebugFn(fn *ir.Func) bool {
+	// if n := fn.Nname; n != nil && n.Sym().Pkg.Path == "0" {
+	// 	if n.Sym().Name == "BroadcastInt64x4" {
+	// 		fmt.Printf("isDebugFn '%s' DOT '%s'\n", n.Sym().Pkg.Path, n.Sym().Name)
+	// 		return true
+	// 	}
+	// }
+	return false
+}
+
 func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
 	v.do = v.doNode // cache closure
 	if ir.DoChildren(fn, v.do) {
@@ -434,6 +447,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 	if n == nil {
 		return false
 	}
+	if v.debug {
+		fmt.Printf("%v: doNode %v budget is %d\n", ir.Line(n), n.Op(), v.budget)
+	}
 opSwitch:
 	switch n.Op() {
 	// Call is okay if inlinable and we have the budget for the body.
@@ -551,12 +567,19 @@ opSwitch:
 		}
 
 		if cheap {
+			if v.debug {
+				if ir.IsIntrinsicCall(n) {
+					fmt.Printf("%v: cheap call is also intrinsic, %v\n", ir.Line(n), n)
+				}
+			}
 			break // treat like any other node, that is, cost of 1
 		}
 
 		if ir.IsIntrinsicCall(n) {
-			// Treat like any other node.
-			break
+			if v.debug {
+				fmt.Printf("%v: intrinsic call, %v\n", ir.Line(n), n)
+			}
+			break // Treat like any other node.
 		}
 
 		if callee := inlCallee(v.curFunc, n.Fun, v.profile, false); callee != nil && typecheck.HaveInlineBody(callee) {
@@ -583,6 +606,10 @@
 			}
 		}
 
+		if v.debug {
+			fmt.Printf("%v: costly OCALLFUNC %v\n", ir.Line(n), n)
+		}
+
 		// Call cost for non-leaf inlining.
 		v.budget -= extraCost
 
@@ -592,6 +619,9 @@
 	// Things that are too hairy, irrespective of the budget
 	case ir.OCALL, ir.OCALLINTER:
 		// Call cost for non-leaf inlining.
+		if v.debug {
+			fmt.Printf("%v: costly OCALL %v\n", ir.Line(n), n)
+		}
 		v.budget -= v.extraCallCost
 
 	case ir.OPANIC:
@@ -743,7 +773,7 @@
 	v.budget--
 
 	// When debugging, don't stop early, to get full cost of inlining this function
-	if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() {
+	if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() && !v.debug {
 		v.reason = "too expensive"
 		return true
 	}
diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go
index ee03075f52..f5b5b9bb7c 100644
--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -1913,6 +1913,13 @@ func IsIntrinsicCall(n *ir.CallExpr) bool {
 	}
 	name, ok := n.Fun.(*ir.Name)
 	if !ok {
+		if n.Fun.Op() == ir.OMETHEXPR {
+			if meth := ir.MethodExprName(n.Fun); meth != nil {
+				if fn := meth.Func; fn != nil {
+					return IsIntrinsicSym(fn.Sym())
+				}
+			}
+		}
 		return false
 	}
 	return IsIntrinsicSym(name.Sym())
-- 
2.52.0
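
For illustration of the new OpAMD64MOVSSconst/OpAMD64MOVSDconst path: when the destination is a general-purpose register (x < x86.REG_X0), the constant is emitted as an integer move of the float's bit pattern -- MOVQ of the raw 64-bit pattern for 8-byte values, MOVL of the narrowed float32 pattern for 4-byte values, and XORL for an untyped zero. The stand-alone sketch below shows only that bit-pattern conversion; auxIntFor is an invented helper standing in for how the op's AuxInt stores the constant.

package main

import (
	"fmt"
	"math"
)

// auxIntFor is an invented stand-in for how the SSA constant is stored:
// AuxInt holds the value's float64 bit pattern, even for float32 constants.
func auxIntFor(f float64) int64 { return int64(math.Float64bits(f)) }

func main() {
	aux := auxIntFor(1.5)

	// 8-byte destination register: the raw 64-bit pattern is the MOVQ immediate.
	fmt.Printf("MOVQ $%#x, reg\n", uint64(aux))

	// 4-byte destination: recover the float64, narrow to float32, and use its
	// 32-bit pattern as the MOVL immediate -- the same expression the patch uses.
	c := int64(math.Float32bits(float32(math.Float64frombits(uint64(aux)))))
	fmt.Printf("MOVL $%#x, reg\n", uint32(c))

	// A +0 constant with no symbol is cheaper still: the patch emits XORL reg, reg.
}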
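For illustration of the intrinsics.go change: IsIntrinsicCall now also recognizes callees that are method expressions (ir.OMETHEXPR), resolving them through ir.MethodExprName, which is how calls to intrinsic methods reach it. The sketch below only shows what such a call looks like in Go source; Vec and Dot are invented names, not actual intrinsics (the real cases are simd methods such as the BroadcastInt64x4 mentioned in the inliner debug hook).

package main

import "fmt"

type Vec struct{ X, Y float64 }

// Dot stands in for a method that the compiler might register as an intrinsic.
func (v Vec) Dot(w Vec) float64 { return v.X*w.X + v.Y*w.Y }

func main() {
	a, b := Vec{1, 2}, Vec{3, 4}

	// Plain method call.
	fmt.Println(a.Dot(b))

	// Method-expression call: the callee is the expression Vec.Dot.
	// Inside the compiler this callee appears as an ir.OMETHEXPR node,
	// the form the intrinsics.go hunk now checks via ir.MethodExprName.
	fmt.Println(Vec.Dot(a, b))
}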