From d5dea86993e1bc07bb9a49d2930655050da006d7 Mon Sep 17 00:00:00 2001
From: David Chase
Date: Thu, 7 Aug 2025 16:44:50 -0400
Subject: [PATCH] [dev.simd] cmd/compile: fix isIntrinsic for methods; fix fp <-> gp moves

also includes a handy debugging hook for the inliner.

Change-Id: I23d0619506219d21db78c6c801612ff058562142
Reviewed-on: https://go-review.googlesource.com/c/go/+/694118
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Junyang Shao
---
 src/cmd/compile/internal/amd64/ssa.go         | 84 +++++++++++++------
 src/cmd/compile/internal/inline/inl.go        | 36 +++++++-
 src/cmd/compile/internal/ssagen/intrinsics.go |  7 ++
 3 files changed, 97 insertions(+), 30 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index d3fae7ce14..38815929d2 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -43,6 +43,10 @@ func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
 	}
 }
 
+func isFPReg(r int16) bool {
+	return x86.REG_X0 <= r && r <= x86.REG_Z31
+}
+
 // loadByType returns the load instruction of the given type.
 func loadByType(t *types.Type) obj.As {
 	// Avoid partial register write
@@ -88,31 +92,33 @@ func storeByType(t *types.Type) obj.As {
 }
 
 // moveByType returns the reg->reg move instruction of the given type.
-func moveByType(t *types.Type) obj.As {
-	if t.IsFloat() {
+func moveByType(from, to *ssa.Value) obj.As {
+	toT := to.Type
+	fromR, toR := from.Reg(), to.Reg()
+	if isFPReg(fromR) && isFPReg(toR) && toT.IsFloat() {
 		// Moving the whole sse2 register is faster
 		// than moving just the correct low portion of it.
 		// There is no xmm->xmm move with 1 byte opcode,
 		// so use movups, which has 2 byte opcode.
 		return x86.AMOVUPS
-	} else if t.IsSIMD() {
-		return simdMov(t.Size())
-	} else {
-		switch t.Size() {
-		case 1:
-			// Avoids partial register write
-			return x86.AMOVL
-		case 2:
-			return x86.AMOVL
-		case 4:
-			return x86.AMOVL
-		case 8:
-			return x86.AMOVQ
-		case 16:
-			return x86.AMOVUPS // int128s are in SSE registers
-		default:
-			panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
-		}
+	}
+	if toT.IsSIMD() {
+		return simdMov(toT.Size())
+	}
+	switch toT.Size() {
+	case 1:
+		// Avoids partial register write
+		return x86.AMOVL
+	case 2:
+		return x86.AMOVL
+	case 4:
+		return x86.AMOVL
+	case 8:
+		return x86.AMOVQ
+	case 16:
+		return x86.AMOVUPS // int128s are in SSE registers
+	default:
+		panic(fmt.Sprintf("bad int register width %d:%v", toT.Size(), toT))
 	}
 }
 
@@ -648,7 +654,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		// But this requires a way for regalloc to know that SRC might be
 		// clobbered by this instruction.
 		t := v.RegTmp()
-		opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
+		opregreg(s, moveByType(v.Args[1], v), t, v.Args[1].Reg())
 
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
@@ -820,13 +826,37 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.From.Offset = v.AuxInt
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = x
+
 	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
 		x := v.Reg()
-		p := s.Prog(v.Op.Asm())
-		p.From.Type = obj.TYPE_FCONST
-		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = x
+		a := v.Op.Asm()
+		if x < x86.REG_X0 { // not an FP register
+			if v.AuxInt == 0 && v.Aux == nil {
+				opregreg(s, x86.AXORL, x, x)
+				break
+			}
+			c := v.AuxInt
+			switch v.Type.Size() {
+			case 4:
+				a = x86.AMOVL
+				c = int64(math.Float32bits(float32(math.Float64frombits(uint64(v.AuxInt)))))
+			case 8:
+				a = x86.AMOVQ
+			default:
+				panic(fmt.Sprintf("unexpected type width for float const into non-float register, %v", v))
+			}
+			p := s.Prog(a)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = c
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x
+		} else {
+			p := s.Prog(a)
+			p.From.Type = obj.TYPE_FCONST
+			p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x
+		}
 	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
 		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
 		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
@@ -1134,7 +1164,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			y = simdOrMaskReg(v)
 		}
 		if x != y {
-			opregreg(s, moveByType(v.Type), y, x)
+			opregreg(s, moveByType(v.Args[0], v), y, x)
 		}
 	case ssa.OpLoadReg:
 		if v.Type.IsFlags() {
diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go
index c06f76fe9f..1ba8350803 100644
--- a/src/cmd/compile/internal/inline/inl.go
+++ b/src/cmd/compile/internal/inline/inl.go
@@ -202,6 +202,7 @@ func inlineBudget(fn *ir.Func, profile *pgoir.Profile, relaxed bool, verbose boo
 		// be very liberal here, if the closure is only called once, the budget is large
 		budget = max(budget, inlineClosureCalledOnceCost)
 	}
+
 	return budget
 }
 
@@ -263,6 +264,7 @@ func CanInline(fn *ir.Func, profile *pgoir.Profile) {
 
 	visitor := hairyVisitor{
 		curFunc:       fn,
+		debug:         isDebugFn(fn),
 		isBigFunc:     IsBigFunc(fn),
 		budget:        budget,
 		maxBudget:     budget,
@@ -407,6 +409,7 @@ type hairyVisitor struct {
 	// This is needed to access the current caller in the doNode function.
 	curFunc       *ir.Func
 	isBigFunc     bool
+	debug         bool
 	budget        int32
 	maxBudget     int32
 	reason        string
@@ -416,6 +419,16 @@ type hairyVisitor struct {
 	profile       *pgoir.Profile
 }
 
+func isDebugFn(fn *ir.Func) bool {
+	// if n := fn.Nname; n != nil && n.Sym().Pkg.Path == "0" {
+	// 	if n.Sym().Name == "BroadcastInt64x4" {
+	// 		fmt.Printf("isDebugFn '%s' DOT '%s'\n", n.Sym().Pkg.Path, n.Sym().Name)
+	// 		return true
+	// 	}
+	// }
+	return false
+}
+
 func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
 	v.do = v.doNode // cache closure
 	if ir.DoChildren(fn, v.do) {
@@ -434,6 +447,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 	if n == nil {
 		return false
 	}
+	if v.debug {
+		fmt.Printf("%v: doNode %v budget is %d\n", ir.Line(n), n.Op(), v.budget)
+	}
 opSwitch:
 	switch n.Op() {
 	// Call is okay if inlinable and we have the budget for the body.
@@ -551,12 +567,19 @@ opSwitch:
 		}
 
 		if cheap {
+			if v.debug {
+				if ir.IsIntrinsicCall(n) {
+					fmt.Printf("%v: cheap call is also intrinsic, %v\n", ir.Line(n), n)
+				}
+			}
 			break // treat like any other node, that is, cost of 1
 		}
 
 		if ir.IsIntrinsicCall(n) {
-			// Treat like any other node.
-			break
+			if v.debug {
+				fmt.Printf("%v: intrinsic call, %v\n", ir.Line(n), n)
+			}
+			break // Treat like any other node.
 		}
 
 		if callee := inlCallee(v.curFunc, n.Fun, v.profile, false); callee != nil && typecheck.HaveInlineBody(callee) {
@@ -583,6 +606,10 @@
 			}
 		}
 
+		if v.debug {
+			fmt.Printf("%v: costly OCALLFUNC %v\n", ir.Line(n), n)
+		}
+
 		// Call cost for non-leaf inlining.
 		v.budget -= extraCost
 
@@ -592,6 +619,9 @@
 	// Things that are too hairy, irrespective of the budget
 	case ir.OCALL, ir.OCALLINTER:
 		// Call cost for non-leaf inlining.
+		if v.debug {
+			fmt.Printf("%v: costly OCALL %v\n", ir.Line(n), n)
+		}
 		v.budget -= v.extraCallCost
 
 	case ir.OPANIC:
@@ -743,7 +773,7 @@
 	v.budget--
 
 	// When debugging, don't stop early, to get full cost of inlining this function
-	if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() {
+	if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() && !v.debug {
 		v.reason = "too expensive"
 		return true
 	}
diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go
index ee03075f52..f5b5b9bb7c 100644
--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -1913,6 +1913,13 @@ func IsIntrinsicCall(n *ir.CallExpr) bool {
 	}
 	name, ok := n.Fun.(*ir.Name)
 	if !ok {
+		if n.Fun.Op() == ir.OMETHEXPR {
+			if meth := ir.MethodExprName(n.Fun); meth != nil {
+				if fn := meth.Func; fn != nil {
+					return IsIntrinsicSym(fn.Sym())
+				}
+			}
+		}
 		return false
 	}
 	return IsIntrinsicSym(name.Sym())
-- 
2.52.0
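
For illustration of the new OpAMD64MOVSSconst/OpAMD64MOVSDconst path: when the destination is a general-purpose register (x < x86.REG_X0), the constant is emitted as an integer move of the float's bit pattern -- MOVQ of the raw 64-bit pattern for 8-byte values, MOVL of the narrowed float32 pattern for 4-byte values, and XORL for an untyped zero. The stand-alone sketch below shows only that bit-pattern conversion; auxIntFor is an invented helper standing in for how the op's AuxInt stores the constant.

package main

import (
	"fmt"
	"math"
)

// auxIntFor is an invented stand-in for how the SSA constant is stored:
// AuxInt holds the value's float64 bit pattern, even for float32 constants.
func auxIntFor(f float64) int64 { return int64(math.Float64bits(f)) }

func main() {
	aux := auxIntFor(1.5)

	// 8-byte destination register: the raw 64-bit pattern is the MOVQ immediate.
	fmt.Printf("MOVQ $%#x, reg\n", uint64(aux))

	// 4-byte destination: recover the float64, narrow to float32, and use its
	// 32-bit pattern as the MOVL immediate -- the same expression the patch uses.
	c := int64(math.Float32bits(float32(math.Float64frombits(uint64(aux)))))
	fmt.Printf("MOVL $%#x, reg\n", uint32(c))

	// A +0 constant with no symbol is cheaper still: the patch emits XORL reg, reg.
}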
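For illustration of the intrinsics.go change: IsIntrinsicCall now also recognizes callees that are method expressions (ir.OMETHEXPR), resolving them through ir.MethodExprName, which is how calls to intrinsic methods reach it. The sketch below only shows what such a call looks like in Go source; Vec and Dot are invented names, not actual intrinsics (the real cases are simd methods such as the BroadcastInt64x4 mentioned in the inliner debug hook).

package main

import "fmt"

type Vec struct{ X, Y float64 }

// Dot stands in for a method that the compiler might register as an intrinsic.
func (v Vec) Dot(w Vec) float64 { return v.X*w.X + v.Y*w.Y }

func main() {
	a, b := Vec{1, 2}, Vec{3, 4}

	// Plain method call.
	fmt.Println(a.Dot(b))

	// Method-expression call: the callee is the expression Vec.Dot.
	// Inside the compiler this callee appears as an ir.OMETHEXPR node,
	// the form the intrinsics.go hunk now checks via ir.MethodExprName.
	fmt.Println(Vec.Dot(a, b))
}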