This significantly speeds up Trunc by intrinsifying it to the SSE4.1 ROUNDSD instruction.
Ceil and Floor use the same instruction, so intrinsify them too.
name     old time/op  new time/op  delta
Floor-6  3.33ns ± 1%  3.22ns ± 0%   -3.39%  (p=0.000 n=10+10)
Ceil-6   3.33ns ± 1%  3.22ns ± 0%   -3.16%  (p=0.000 n=10+7)
Trunc-6  4.83ns ± 0%  3.22ns ± 0%  -33.36%  (p=0.000 n=6+8)
Change-Id: If848790e458eedfe38a6a0407bb4f589c68ac254
Reviewed-on: https://go-review.googlesource.com/68630
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
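For reference, the three functions map directly onto ROUNDSD rounding modes: Floor rounds toward -Inf, Ceil toward +Inf, Trunc toward zero. A minimal demonstration of the semantics being intrinsified (standard library only):

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		for _, x := range []float64{2.7, -2.7} {
			fmt.Println(math.Floor(x), math.Ceil(x), math.Trunc(x))
		}
		// Output:
		// 2 3 2
		// -3 -2 -2
	}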
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
+ case ssa.OpAMD64ROUNDSD:
+ p := s.Prog(v.Op.Asm())
+ val := v.AuxInt
+ // 1 means math.Floor, 2 Ceil, 3 Trunc
+ if val != 1 && val != 2 && val != 3 {
+ v.Fatalf("Invalid rounding mode")
+ }
+ p.From.Offset = val
+ p.From.Type = obj.TYPE_CONST
+ p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = v.Reg()
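The case above builds a three-operand instruction: From holds the rounding-mode immediate, From3 the source register, and To the destination. For math.Floor the emitted code should look roughly like this (a sketch of expected assembler output; actual register allocation may differ):

	ROUNDSD	$1, X0, X0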
case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
if v.Args[0].Reg() != v.Reg() {
// POPCNT on Intel has a false dependency on the destination register.
{"msanread", funcTag, 112},
{"msanwrite", funcTag, 112},
{"support_popcnt", varTag, 11},
+ {"support_sse41", varTag, 11},
}
func runtimeTypes() []*types.Type {
// architecture variants
var support_popcnt bool
+var support_sse41 bool
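For context, the runtime populates support_sse41 once at startup; SSE4.1 support is reported in CPUID.01H:ECX bit 19. A minimal sketch of the equivalent detection in Go, assuming a hypothetical cpuid helper (the real check is done in runtime assembly during bootstrap):

	// cpuid is hypothetical here; the runtime performs this query in
	// assembly and stores the result in support_sse41.
	func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)

	func detectSSE41() bool {
		_, _, ecx, _ := cpuid(1, 0)
		return ecx&(1<<19) != 0 // CPUID.01H:ECX.SSE4_1[bit 19]
	}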
},
sys.PPC64)
+ makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ aux := syslook("support_sse41").Sym.Linksym()
+ addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), aux, s.sb)
+ v := s.newValue2(ssa.OpLoad, types.Types[TBOOL], addr, s.mem())
+ b := s.endBlock()
+ b.Kind = ssa.BlockIf
+ b.SetControl(v)
+ bTrue := s.f.NewBlock(ssa.BlockPlain)
+ bFalse := s.f.NewBlock(ssa.BlockPlain)
+ bEnd := s.f.NewBlock(ssa.BlockPlain)
+ b.AddEdgeTo(bTrue)
+ b.AddEdgeTo(bFalse)
+ b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
+
+ // We have the intrinsic - use it directly.
+ s.startBlock(bTrue)
+ s.vars[n] = s.newValue1(op, types.Types[TFLOAT64], args[0])
+ s.endBlock().AddEdgeTo(bEnd)
+
+ // Call the pure Go version.
+ s.startBlock(bFalse)
+ a := s.call(n, callNormal)
+ s.vars[n] = s.newValue2(ssa.OpLoad, types.Types[TFLOAT64], a, s.mem())
+ s.endBlock().AddEdgeTo(bEnd)
+
+ // Merge results.
+ s.startBlock(bEnd)
+ return s.variable(n, types.Types[TFLOAT64])
+ }
+ }
+ addF("math", "Floor",
+ makeRoundAMD64(ssa.OpFloor),
+ sys.AMD64)
+ addF("math", "Ceil",
+ makeRoundAMD64(ssa.OpCeil),
+ sys.AMD64)
+ addF("math", "Trunc",
+ makeRoundAMD64(ssa.OpTrunc),
+ sys.AMD64)
+
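The builder above plants an explicit feature check at every call site, so each intrinsified call compiles to roughly the following shape (a pseudocode sketch of the generated SSA blocks, not literal compiler output):

	if support_sse41 {       // bTrue: marked likely
		r = ROUNDSD(mode, x) // single instruction
	} else {                 // bFalse: pre-SSE4.1 machines
		r = math.Floor(x)    // normal call to the library routine
	}
	// bEnd: r is the merged result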
/******** math/bits ********/
addF("math/bits", "TrailingZeros64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
(Sqrt x) -> (SQRTSD x)
+(Floor x) -> (ROUNDSD [1] x)
+(Ceil x) -> (ROUNDSD [2] x)
+(Trunc x) -> (ROUNDSD [3] x)
+
// Lowering extension
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
(SignExt8to16 x) -> (MOVBQSX x)
{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
+ // ROUNDSD instruction isn't guaranteed to be on the target platform (it is SSE4.1)
+ // Any use must be preceded by a successful check of runtime.support_sse41.
+ {name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, // rounds arg0 depending on auxint, 1 means math.Floor, 2 Ceil, 3 Trunc
+
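For reference, the AuxInt values are ROUNDSD's imm8 rounding-mode field as defined in the Intel SDM; the constant names below are invented for illustration, only the raw values appear in the compiler:

	// Hypothetical names; the compiler carries the raw imm8 value in AuxInt.
	const (
		roundNearest = 0 // imm8[1:0] = 00b, round to nearest even (unused here)
		roundFloor   = 1 // 01b, round toward -Inf -> math.Floor
		roundCeil    = 2 // 10b, round toward +Inf -> math.Ceil
		roundTrunc   = 3 // 11b, round toward zero -> math.Trunc
	)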
{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
// Note: SBBW and SBBB are subsumed by SBBL
OpAMD64POPCNTQ
OpAMD64POPCNTL
OpAMD64SQRTSD
+ OpAMD64ROUNDSD
OpAMD64SBBQcarrymask
OpAMD64SBBLcarrymask
OpAMD64SETEQ
},
},
},
+ {
+ name: "ROUNDSD",
+ auxType: auxInt8,
+ argLen: 1,
+ asm: x86.AROUNDSD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+ },
+ outputs: []outputInfo{
+ {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+ },
+ },
+ },
{
name: "SBBQcarrymask",
argLen: 1,
return rewriteValueAMD64_OpBswap32_0(v)
case OpBswap64:
return rewriteValueAMD64_OpBswap64_0(v)
+ case OpCeil:
+ return rewriteValueAMD64_OpCeil_0(v)
case OpClosureCall:
return rewriteValueAMD64_OpClosureCall_0(v)
case OpCom16:
return rewriteValueAMD64_OpEqB_0(v)
case OpEqPtr:
return rewriteValueAMD64_OpEqPtr_0(v)
+ case OpFloor:
+ return rewriteValueAMD64_OpFloor_0(v)
case OpGeq16:
return rewriteValueAMD64_OpGeq16_0(v)
case OpGeq16U:
return rewriteValueAMD64_OpSub8_0(v)
case OpSubPtr:
return rewriteValueAMD64_OpSubPtr_0(v)
+ case OpTrunc:
+ return rewriteValueAMD64_OpTrunc_0(v)
case OpTrunc16to8:
return rewriteValueAMD64_OpTrunc16to8_0(v)
case OpTrunc32to16:
return true
}
}
+func rewriteValueAMD64_OpCeil_0(v *Value) bool {
+ // match: (Ceil x)
+ // cond:
+ // result: (ROUNDSD [2] x)
+ for {
+ x := v.Args[0]
+ v.reset(OpAMD64ROUNDSD)
+ v.AuxInt = 2
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueAMD64_OpClosureCall_0(v *Value) bool {
// match: (ClosureCall [argwid] entry closure mem)
// cond:
}
return false
}
+func rewriteValueAMD64_OpFloor_0(v *Value) bool {
+ // match: (Floor x)
+ // cond:
+ // result: (ROUNDSD [1] x)
+ for {
+ x := v.Args[0]
+ v.reset(OpAMD64ROUNDSD)
+ v.AuxInt = 1
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueAMD64_OpGeq16_0(v *Value) bool {
b := v.Block
_ = b
}
return false
}
+func rewriteValueAMD64_OpTrunc_0(v *Value) bool {
+ // match: (Trunc x)
+ // cond:
+ // result: (ROUNDSD [3] x)
+ for {
+ x := v.Args[0]
+ v.reset(OpAMD64ROUNDSD)
+ v.AuxInt = 3
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueAMD64_OpTrunc16to8_0(v *Value) bool {
// match: (Trunc16to8 x)
// cond:
// func Floor(x float64) float64
TEXT ·Floor(SB),NOSPLIT,$0
- CMPB ·useSSE41(SB), $1
- JNE nosse4
- ROUNDSD $1, x+0(FP), X0
- MOVQ X0, ret+8(FP)
- RET
-nosse4:
MOVQ x+0(FP), AX
MOVQ $~(1<<63), DX // sign bit mask
ANDQ AX,DX // DX = |x|
// func Ceil(x float64) float64
TEXT ·Ceil(SB),NOSPLIT,$0
- CMPB ·useSSE41(SB), $1
- JNE nosse4
- ROUNDSD $2, x+0(FP), X0
- MOVQ X0, ret+8(FP)
- RET
-nosse4:
MOVQ x+0(FP), AX
MOVQ $~(1<<63), DX // sign bit mask
MOVQ AX, BX // BX = copy of x
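With the compiler emitting the guarded ROUNDSD itself, the per-call CMPB/JNE on ·useSSE41 is deleted from the assembly; what remains is only the bit-manipulation path, now reached solely through the intrinsic's fallback branch on pre-SSE4.1 CPUs. A sketch of that fallback strategy in Go (an assumption about the approach, not a transcription of the remaining asm):

	// floorFallback illustrates the no-SSE4.1 strategy: values with
	// |x| >= 2^52 (or NaN) are already integral, smaller values are
	// truncated via integer conversion and adjusted downward.
	func floorFallback(x float64) float64 {
		if math.IsNaN(x) || math.Abs(x) >= 1<<52 {
			return x
		}
		t := float64(int64(x)) // truncates toward zero
		if t > x {             // x was negative with a fractional part
			t--
		}
		return t
	}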