}
}
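+// isFPReg reports whether r is a floating-point/vector register (X0 through Z31).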
+func isFPReg(r int16) bool {
+ return x86.REG_X0 <= r && r <= x86.REG_Z31
+}
+
// loadByType returns the load instruction of the given type.
func loadByType(t *types.Type) obj.As {
// Avoid partial register write
}
-// moveByType returns the reg->reg move instruction of the given type.
+// moveByType returns the reg->reg move instruction for copying from's
+// register into to's register, taking both register classes and to's
+// type into account.
-func moveByType(t *types.Type) obj.As {
- if t.IsFloat() {
+func moveByType(from, to *ssa.Value) obj.As {
+ toT := to.Type
+ fromR, toR := from.Reg(), to.Reg()
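+ // The type alone no longer determines the register class,
+ // so check the actual source and destination registers.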
+ if isFPReg(fromR) && isFPReg(toR) && toT.IsFloat() {
// Moving the whole sse2 register is faster
// than moving just the correct low portion of it.
// There is no xmm->xmm move with 1 byte opcode,
// so use movups, which has 2 byte opcode.
return x86.AMOVUPS
- } else if t.IsSIMD() {
- return simdMov(t.Size())
- } else {
- switch t.Size() {
- case 1:
- // Avoids partial register write
- return x86.AMOVL
- case 2:
- return x86.AMOVL
- case 4:
- return x86.AMOVL
- case 8:
- return x86.AMOVQ
- case 16:
- return x86.AMOVUPS // int128s are in SSE registers
- default:
- panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
- }
+ }
+ if toT.IsSIMD() {
+ return simdMov(toT.Size())
+ }
+ switch toT.Size() {
+ case 1:
+ // Avoids partial register write
+ return x86.AMOVL
+ case 2:
+ return x86.AMOVL
+ case 4:
+ return x86.AMOVL
+ case 8:
+ return x86.AMOVQ
+ case 16:
+ return x86.AMOVUPS // int128s are in SSE registers
+ default:
+ panic(fmt.Sprintf("bad int register width %d:%v", toT.Size(), toT))
}
}
// But this requires a way for regalloc to know that SRC might be
// clobbered by this instruction.
t := v.RegTmp()
- opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
+ opregreg(s, moveByType(v.Args[1], v), t, v.Args[1].Reg())
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = x
+
case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
x := v.Reg()
- p := s.Prog(v.Op.Asm())
- p.From.Type = obj.TYPE_FCONST
- p.From.Val = math.Float64frombits(uint64(v.AuxInt))
- p.To.Type = obj.TYPE_REG
- p.To.Reg = x
+ a := v.Op.Asm()
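+ // The constant may have been allocated to a general purpose register;
+ // in that case emit an integer move of the bit pattern (or XORL for zero).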
+ if x < x86.REG_X0 { // not an FP register
+ if v.AuxInt == 0 && v.Aux == nil {
+ opregreg(s, x86.AXORL, x, x)
+ break
+ }
+ c := v.AuxInt
+ switch v.Type.Size() {
+ case 4:
+ a = x86.AMOVL
+ c = int64(math.Float32bits(float32(math.Float64frombits(uint64(v.AuxInt)))))
+ case 8:
+ a = x86.AMOVQ
+ default:
+ panic(fmt.Sprintf("unexpected type width for float const into non-float register, %v", v))
+ }
+ p := s.Prog(a)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = c
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = x
+ } else {
+ p := s.Prog(a)
+ p.From.Type = obj.TYPE_FCONST
+ p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = x
+ }
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
y = simdOrMaskReg(v)
}
if x != y {
- opregreg(s, moveByType(v.Type), y, x)
+ opregreg(s, moveByType(v.Args[0], v), y, x)
}
case ssa.OpLoadReg:
if v.Type.IsFlags() {
// be very liberal here, if the closure is only called once, the budget is large
budget = max(budget, inlineClosureCalledOnceCost)
}
+
return budget
}
visitor := hairyVisitor{
curFunc: fn,
+ debug: isDebugFn(fn),
isBigFunc: IsBigFunc(fn),
budget: budget,
maxBudget: budget,
// This is needed to access the current caller in the doNode function.
curFunc *ir.Func
isBigFunc bool
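+ // debug enables printing of per-node inlining-cost accounting (see isDebugFn).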
+ debug bool
budget int32
maxBudget int32
reason string
profile *pgoir.Profile
}
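+
+// isDebugFn reports whether verbose inlining-cost accounting should be
+// printed for fn. It is a development-time hook; the package/function
+// lookup below is left commented out so it normally returns false.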
+func isDebugFn(fn *ir.Func) bool {
+ // if n := fn.Nname; n != nil && n.Sym().Pkg.Path == "0" {
+ // if n.Sym().Name == "BroadcastInt64x4" {
+ // fmt.Printf("isDebugFn '%s' DOT '%s'\n", n.Sym().Pkg.Path, n.Sym().Name)
+ // return true
+ // }
+ // }
+ return false
+}
+
func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
v.do = v.doNode // cache closure
if ir.DoChildren(fn, v.do) {
if n == nil {
return false
}
+ if v.debug {
+ fmt.Printf("%v: doNode %v budget is %d\n", ir.Line(n), n.Op(), v.budget)
+ }
opSwitch:
switch n.Op() {
// Call is okay if inlinable and we have the budget for the body.
}
if cheap {
+ if v.debug && ir.IsIntrinsicCall(n) {
+ fmt.Printf("%v: cheap call is also intrinsic, %v\n", ir.Line(n), n)
+ }
break // treat like any other node, that is, cost of 1
}
if ir.IsIntrinsicCall(n) {
- // Treat like any other node.
- break
+ if v.debug {
+ fmt.Printf("%v: intrinsic call, %v\n", ir.Line(n), n)
+ }
+ break // Treat like any other node.
}
if callee := inlCallee(v.curFunc, n.Fun, v.profile, false); callee != nil && typecheck.HaveInlineBody(callee) {
}
}
+ if v.debug {
+ fmt.Printf("%v: costly OCALLFUNC %v\n", ir.Line(n), n)
+ }
+
// Call cost for non-leaf inlining.
v.budget -= extraCost
// Things that are too hairy, irrespective of the budget
case ir.OCALL, ir.OCALLINTER:
// Call cost for non-leaf inlining.
+ if v.debug {
+ fmt.Printf("%v: costly OCALL %v\n", ir.Line(n), n)
+ }
v.budget -= v.extraCallCost
case ir.OPANIC:
v.budget--
+ // When debugging, don't stop early, so the full cost of inlining this function is reported.
- if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() {
+ if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() && !v.debug {
v.reason = "too expensive"
return true
}