}
}
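+// isFPReg reports whether r is a floating-point/vector register (X0 through Z31).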
+func isFPReg(r int16) bool {
+ return x86.REG_X0 <= r && r <= x86.REG_Z31
+}
+
// loadByType returns the load instruction of the given type.
func loadByType(t *types.Type) obj.As {
// Avoid partial register write
}
-// moveByType returns the reg->reg move instruction of the given type.
+// moveByType returns the reg->reg move instruction for copying from's
+// register into to's register, taking both register classes and to's
+// type into account.
-func moveByType(t *types.Type) obj.As {
- if t.IsFloat() {
+func moveByType(from, to *ssa.Value) obj.As {
+ toT := to.Type
+ fromR, toR := from.Reg(), to.Reg()
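+ // The type alone no longer determines the register class,
+ // so check the actual source and destination registers.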
+ if isFPReg(fromR) && isFPReg(toR) && toT.IsFloat() {
// Moving the whole sse2 register is faster
// than moving just the correct low portion of it.
// There is no xmm->xmm move with 1 byte opcode,
// so use movups, which has 2 byte opcode.
return x86.AMOVUPS
- } else if t.IsSIMD() {
- return simdMov(t.Size())
- } else {
- switch t.Size() {
- case 1:
- // Avoids partial register write
- return x86.AMOVL
- case 2:
- return x86.AMOVL
- case 4:
- return x86.AMOVL
- case 8:
- return x86.AMOVQ
- case 16:
- return x86.AMOVUPS // int128s are in SSE registers
- default:
- panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
- }
+ }
+ if toT.IsSIMD() {
+ return simdMov(toT.Size())
+ }
+ switch toT.Size() {
+ case 1:
+ // Avoids partial register write
+ return x86.AMOVL
+ case 2:
+ return x86.AMOVL
+ case 4:
+ return x86.AMOVL
+ case 8:
+ return x86.AMOVQ
+ case 16:
+ return x86.AMOVUPS // int128s are in SSE registers
+ default:
+ panic(fmt.Sprintf("bad int register width %d:%v", toT.Size(), toT))
}
}
// But this requires a way for regalloc to know that SRC might be
// clobbered by this instruction.
t := v.RegTmp()
- opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
+ opregreg(s, moveByType(v.Args[1], v), t, v.Args[1].Reg())
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = x
+
case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
x := v.Reg()
- p := s.Prog(v.Op.Asm())
- p.From.Type = obj.TYPE_FCONST
- p.From.Val = math.Float64frombits(uint64(v.AuxInt))
- p.To.Type = obj.TYPE_REG
- p.To.Reg = x
+ a := v.Op.Asm()
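+ // The constant may have been allocated to a general purpose register;
+ // in that case emit an integer move of the bit pattern (or XORL for zero).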
+ if x < x86.REG_X0 { // not an FP register
+ if v.AuxInt == 0 && v.Aux == nil {
+ opregreg(s, x86.AXORL, x, x)
+ break
+ }
+ c := v.AuxInt
+ switch v.Type.Size() {
+ case 4:
+ a = x86.AMOVL
+ c = int64(math.Float32bits(float32(math.Float64frombits(uint64(v.AuxInt)))))
+ case 8:
+ a = x86.AMOVQ
+ default:
+ panic(fmt.Sprintf("unexpected type width for float const into non-float register, %v", v))
+ }
+ p := s.Prog(a)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = c
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = x
+ } else {
+ p := s.Prog(a)
+ p.From.Type = obj.TYPE_FCONST
+ p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = x
+ }
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
y = simdOrMaskReg(v)
}
if x != y {
- opregreg(s, moveByType(v.Type), y, x)
+ opregreg(s, moveByType(v.Args[0], v), y, x)
}
case ssa.OpLoadReg:
if v.Type.IsFlags() {
// be very liberal here, if the closure is only called once, the budget is large
budget = max(budget, inlineClosureCalledOnceCost)
}
+
return budget
}
visitor := hairyVisitor{
curFunc: fn,
+ debug: isDebugFn(fn),
isBigFunc: IsBigFunc(fn),
budget: budget,
maxBudget: budget,
// This is needed to access the current caller in the doNode function.
curFunc *ir.Func
isBigFunc bool
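+ // debug enables printing of per-node inlining-cost accounting (see isDebugFn).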
+ debug bool
budget int32
maxBudget int32
reason string
profile *pgoir.Profile
}
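+
+// isDebugFn reports whether verbose inlining-cost accounting should be
+// printed for fn. It is a development-time hook; the package/function
+// lookup below is left commented out so it normally returns false.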
+func isDebugFn(fn *ir.Func) bool {
+ // if n := fn.Nname; n != nil && n.Sym().Pkg.Path == "0" {
+ // if n.Sym().Name == "BroadcastInt64x4" {
+ // fmt.Printf("isDebugFn '%s' DOT '%s'\n", n.Sym().Pkg.Path, n.Sym().Name)
+ // return true
+ // }
+ // }
+ return false
+}
+
func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
v.do = v.doNode // cache closure
if ir.DoChildren(fn, v.do) {
if n == nil {
return false
}
+ if v.debug {
+ fmt.Printf("%v: doNode %v budget is %d\n", ir.Line(n), n.Op(), v.budget)
+ }
opSwitch:
switch n.Op() {
// Call is okay if inlinable and we have the budget for the body.
}
if cheap {
+ if v.debug && ir.IsIntrinsicCall(n) {
+ fmt.Printf("%v: cheap call is also intrinsic, %v\n", ir.Line(n), n)
+ }
break // treat like any other node, that is, cost of 1
}
if ir.IsIntrinsicCall(n) {
- // Treat like any other node.
- break
+ if v.debug {
+ fmt.Printf("%v: intrinsic call, %v\n", ir.Line(n), n)
+ }
+ break // Treat like any other node.
}
if callee := inlCallee(v.curFunc, n.Fun, v.profile, false); callee != nil && typecheck.HaveInlineBody(callee) {
}
}
+ if v.debug {
+ fmt.Printf("%v: costly OCALLFUNC %v\n", ir.Line(n), n)
+ }
+
// Call cost for non-leaf inlining.
v.budget -= extraCost
// Things that are too hairy, irrespective of the budget
case ir.OCALL, ir.OCALLINTER:
// Call cost for non-leaf inlining.
+ if v.debug {
+ fmt.Printf("%v: costly OCALL %v\n", ir.Line(n), n)
+ }
v.budget -= v.extraCallCost
case ir.OPANIC:
v.budget--
+ // When debugging, don't stop early, so the full cost of inlining this function is reported.
- if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() {
+ if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() && !v.debug {
v.reason = "too expensive"
return true
}