From 94484d8ed5155873306ab41f49b0b30be19031a9 Mon Sep 17 00:00:00 2001
From: Ilya Tocar
Date: Thu, 5 Oct 2017 15:45:46 -0500
Subject: [PATCH] cmd/compile: intrinsify math.{Trunc/Ceil/Floor} on amd64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This significantly speeds up Trunc. Ceil and Floor use the same
instruction, so intrinsify them too.

name     old time/op  new time/op  delta
Floor-6  3.33ns ± 1%  3.22ns ± 0%   -3.39%  (p=0.000 n=10+10)
Ceil-6   3.33ns ± 1%  3.22ns ± 0%   -3.16%  (p=0.000 n=10+7)
Trunc-6  4.83ns ± 0%  3.22ns ± 0%  -33.36%  (p=0.000 n=6+8)

Change-Id: If848790e458eedfe38a6a0407bb4f589c68ac254
Reviewed-on: https://go-review.googlesource.com/68630
Run-TryBot: Ilya Tocar
TryBot-Result: Gobot Gobot
Reviewed-by: Keith Randall
---
 src/cmd/compile/internal/amd64/ssa.go          | 12 ++++++
 src/cmd/compile/internal/gc/builtin.go         |  1 +
 .../compile/internal/gc/builtin/runtime.go     |  1 +
 src/cmd/compile/internal/gc/ssa.go             | 41 ++++++++++++++++++
 src/cmd/compile/internal/ssa/gen/AMD64.rules   |  4 ++
 src/cmd/compile/internal/ssa/gen/AMD64Ops.go   |  4 ++
 src/cmd/compile/internal/ssa/opGen.go          | 15 +++++++
 src/cmd/compile/internal/ssa/rewriteAMD64.go   | 42 +++++++++++++++++++
 src/math/floor_amd64.s                         | 12 ------
 9 files changed, 120 insertions(+), 12 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index c116336c7f..5bf8f0e4d8 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -852,6 +852,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.From.Reg = v.Args[0].Reg()
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
+	case ssa.OpAMD64ROUNDSD:
+		p := s.Prog(v.Op.Asm())
+		val := v.AuxInt
+		// 1 means math.Floor, 2 Ceil, 3 Trunc
+		if val != 1 && val != 2 && val != 3 {
+			v.Fatalf("Invalid rounding mode")
+		}
+		p.From.Offset = val
+		p.From.Type = obj.TYPE_CONST
+		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = v.Reg()
 	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
 		if v.Args[0].Reg() != v.Reg() {
 			// POPCNT on Intel has a false dependency on the destination register.
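The immediate byte of ROUNDSD selects the SSE4.1 rounding mode, which is why
the backend accepts only 1 (round toward -Inf), 2 (round toward +Inf) and
3 (round toward zero) here. As a reference, the semantics of the three
encodings can be sketched in plain Go (an illustrative stand-in, not
compiler code):

    package sketch

    import "math"

    // roundsd mirrors what ROUNDSD computes for the three immediates
    // the compiler emits; immediate 0 (round to nearest even) is unused.
    func roundsd(mode int64, x float64) float64 {
        switch mode {
        case 1:
            return math.Floor(x) // round toward -Inf
        case 2:
            return math.Ceil(x) // round toward +Inf
        case 3:
            return math.Trunc(x) // round toward zero
        }
        panic("invalid rounding mode")
    }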
diff --git a/src/cmd/compile/internal/gc/builtin.go b/src/cmd/compile/internal/gc/builtin.go
index 79e1a51a4b..a72b36b1fd 100644
--- a/src/cmd/compile/internal/gc/builtin.go
+++ b/src/cmd/compile/internal/gc/builtin.go
@@ -146,6 +146,7 @@ var runtimeDecls = [...]struct {
 	{"msanread", funcTag, 112},
 	{"msanwrite", funcTag, 112},
 	{"support_popcnt", varTag, 11},
+	{"support_sse41", varTag, 11},
 }
 
 func runtimeTypes() []*types.Type {
diff --git a/src/cmd/compile/internal/gc/builtin/runtime.go b/src/cmd/compile/internal/gc/builtin/runtime.go
index 03b853ecc5..5220e251fa 100644
--- a/src/cmd/compile/internal/gc/builtin/runtime.go
+++ b/src/cmd/compile/internal/gc/builtin/runtime.go
@@ -191,3 +191,4 @@ func msanwrite(addr, size uintptr)
 
 // architecture variants
 var support_popcnt bool
+var support_sse41 bool
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 233c639ba0..34c74a281b 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -2823,6 +2823,47 @@ func init() {
 		},
 		sys.PPC64)
 
+	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+		return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+			aux := syslook("support_sse41").Sym.Linksym()
+			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), aux, s.sb)
+			v := s.newValue2(ssa.OpLoad, types.Types[TBOOL], addr, s.mem())
+			b := s.endBlock()
+			b.Kind = ssa.BlockIf
+			b.SetControl(v)
+			bTrue := s.f.NewBlock(ssa.BlockPlain)
+			bFalse := s.f.NewBlock(ssa.BlockPlain)
+			bEnd := s.f.NewBlock(ssa.BlockPlain)
+			b.AddEdgeTo(bTrue)
+			b.AddEdgeTo(bFalse)
+			b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
+
+			// We have the intrinsic - use it directly.
+			s.startBlock(bTrue)
+			s.vars[n] = s.newValue1(op, types.Types[TFLOAT64], args[0])
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Call the pure Go version.
+			s.startBlock(bFalse)
+			a := s.call(n, callNormal)
+			s.vars[n] = s.newValue2(ssa.OpLoad, types.Types[TFLOAT64], a, s.mem())
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Merge results.
+			s.startBlock(bEnd)
+			return s.variable(n, types.Types[TFLOAT64])
+		}
+	}
+	addF("math", "Floor",
+		makeRoundAMD64(ssa.OpFloor),
+		sys.AMD64)
+	addF("math", "Ceil",
+		makeRoundAMD64(ssa.OpCeil),
+		sys.AMD64)
+	addF("math", "Trunc",
+		makeRoundAMD64(ssa.OpTrunc),
+		sys.AMD64)
+
 	/******** math/bits ********/
 	addF("math/bits", "TrailingZeros64",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index c0d9dda386..d26cdfba56 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -113,6 +113,10 @@
 
 (Sqrt x) -> (SQRTSD x)
 
+(Floor x) -> (ROUNDSD [1] x)
+(Ceil x) -> (ROUNDSD [2] x)
+(Trunc x) -> (ROUNDSD [3] x)
+
 // Lowering extension
 // Note: we always extend to 64 bits even though some ops don't need that many result bits.
 (SignExt8to16 x) -> (MOVBQSX x)
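makeRoundAMD64 wraps each rounding op in a CPU-feature guard: it loads
runtime.support_sse41, takes the ROUNDSD block when the flag is set (and
marks that branch likely), and otherwise falls back to a normal call of the
pure Go implementation. The control flow it builds corresponds roughly to
this sketch (supportSSE41 and roundsdFloor are hypothetical stand-ins for
the runtime flag and the hardware instruction):

    package sketch

    import "math"

    var supportSSE41 bool // the runtime fills its flag in from CPUID at startup

    // floorGuarded shows the shape of the code the intrinsic expands to.
    func floorGuarded(x float64) float64 {
        if supportSSE41 { // BranchLikely: most machines have SSE4.1
            return roundsdFloor(x) // fast path: a single ROUNDSD $1
        }
        return math.Floor(x) // slow path: call the generic implementation
    }

    func roundsdFloor(x float64) float64 { return math.Floor(x) } // stand-in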
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index a15d3e4519..0c3b2efa30 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -341,6 +341,10 @@ func init() {
 
 		{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
 
+		// ROUNDSD instruction isn't guaranteed to be on the target platform (it is SSE4.1)
+		// Any use must be preceded by a successful check of runtime.support_sse41.
+		{name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, // rounds arg0 depending on auxint, 1 means math.Floor, 2 Ceil, 3 Trunc
+
 		{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
 		{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
 		// Note: SBBW and SBBB are subsumed by SBBL
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 9d44e1ab84..8d1a70f654 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -557,6 +557,7 @@ const (
 	OpAMD64POPCNTQ
 	OpAMD64POPCNTL
 	OpAMD64SQRTSD
+	OpAMD64ROUNDSD
 	OpAMD64SBBQcarrymask
 	OpAMD64SBBLcarrymask
 	OpAMD64SETEQ
@@ -6716,6 +6717,20 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:    "ROUNDSD",
+		auxType: auxInt8,
+		argLen:  1,
+		asm:     x86.AROUNDSD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
 	{
 		name:   "SBBQcarrymask",
 		argLen: 1,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index df49aa5df7..e6f574b6c1 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -473,6 +473,8 @@ func rewriteValueAMD64(v *Value) bool {
 	case OpBswap32:
 		return rewriteValueAMD64_OpBswap32_0(v)
 	case OpBswap64:
 		return rewriteValueAMD64_OpBswap64_0(v)
+	case OpCeil:
+		return rewriteValueAMD64_OpCeil_0(v)
 	case OpClosureCall:
 		return rewriteValueAMD64_OpClosureCall_0(v)
 	case OpCom16:
@@ -563,6 +565,8 @@ func rewriteValueAMD64(v *Value) bool {
 	case OpEqB:
 		return rewriteValueAMD64_OpEqB_0(v)
 	case OpEqPtr:
 		return rewriteValueAMD64_OpEqPtr_0(v)
+	case OpFloor:
+		return rewriteValueAMD64_OpFloor_0(v)
 	case OpGeq16:
 		return rewriteValueAMD64_OpGeq16_0(v)
 	case OpGeq16U:
@@ -893,6 +897,8 @@ func rewriteValueAMD64(v *Value) bool {
 	case OpSub8:
 		return rewriteValueAMD64_OpSub8_0(v)
 	case OpSubPtr:
 		return rewriteValueAMD64_OpSubPtr_0(v)
+	case OpTrunc:
+		return rewriteValueAMD64_OpTrunc_0(v)
 	case OpTrunc16to8:
 		return rewriteValueAMD64_OpTrunc16to8_0(v)
 	case OpTrunc32to16:
 		return rewriteValueAMD64_OpTrunc32to16_0(v)
@@ -42210,6 +42216,18 @@ func rewriteValueAMD64_OpBswap64_0(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpCeil_0(v *Value) bool {
+	// match: (Ceil x)
+	// cond:
+	// result: (ROUNDSD [2] x)
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64ROUNDSD)
+		v.AuxInt = 2
+		v.AddArg(x)
+		return true
+	}
+}
 func rewriteValueAMD64_OpClosureCall_0(v *Value) bool {
 	// match: (ClosureCall [argwid] entry closure mem)
 	// cond:
@@ -42958,6 +42976,18 @@ func rewriteValueAMD64_OpEqPtr_0(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpFloor_0(v *Value) bool {
+	// match: (Floor x)
+	// cond:
+	// result: (ROUNDSD [1] x)
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64ROUNDSD)
+		v.AuxInt = 1
+		v.AddArg(x)
+		return true
+	}
+}
 func rewriteValueAMD64_OpGeq16_0(v *Value) bool {
 	b := v.Block
 	_ = b
@@ -46885,6 +46915,18 @@ func rewriteValueAMD64_OpSubPtr_0(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpTrunc_0(v *Value) bool {
+	// match: (Trunc x)
+	// cond:
+	// result: (ROUNDSD [3] x)
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64ROUNDSD)
+		v.AuxInt = 3
+		v.AddArg(x)
+		return true
+	}
+}
 func rewriteValueAMD64_OpTrunc16to8_0(v *Value) bool {
 	// match: (Trunc16to8 x)
 	// cond:
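Each generated rewrite just swaps the generic op for ROUNDSD with the
matching immediate; the SSE4.1 guard was already placed by the intrinsic
builder above. A quick end-to-end sanity check is to exercise the stdlib
functions on sign-sensitive inputs (an illustrative program, not part of
this change):

    package main

    import (
        "fmt"
        "math"
    )

    func main() {
        // Floor, Ceil and Trunc differ exactly where rounding toward
        // -Inf and truncation toward zero disagree: negative non-integers.
        for _, x := range []float64{1.5, -1.5, -0.2} {
            fmt.Println(x, math.Floor(x), math.Ceil(x), math.Trunc(x))
        }
        // -1.5 -> Floor -2, Ceil -1, Trunc -1
    }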
diff --git a/src/math/floor_amd64.s b/src/math/floor_amd64.s
index 678d64361c..4ef02eb09a 100644
--- a/src/math/floor_amd64.s
+++ b/src/math/floor_amd64.s
@@ -8,12 +8,6 @@
 // func Floor(x float64) float64
 TEXT ·Floor(SB),NOSPLIT,$0
-	CMPB	·useSSE41(SB), $1
-	JNE	nosse4
-	ROUNDSD	$1, x+0(FP), X0
-	MOVQ	X0, ret+8(FP)
-	RET
-nosse4:
 	MOVQ	x+0(FP), AX
 	MOVQ	$~(1<<63), DX // sign bit mask
 	ANDQ	AX,DX // DX = |x|
@@ -36,12 +30,6 @@ isBig_floor:
 // func Ceil(x float64) float64
 TEXT ·Ceil(SB),NOSPLIT,$0
-	CMPB	·useSSE41(SB), $1
-	JNE	nosse4
-	ROUNDSD	$2, x+0(FP), X0
-	MOVQ	X0, ret+8(FP)
-	RET
-nosse4:
 	MOVQ	x+0(FP), AX
 	MOVQ	$~(1<<63), DX // sign bit mask
 	MOVQ	AX, BX // BX = copy of x
-- 
2.48.1
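With the SSE4.1 check now emitted by the compiler, the assembly keeps only
the generic path for older CPUs. Very roughly, its strategy transliterated
to Go looks like this (a sketch of the idea; edge cases such as -0 are
glossed over, and this is not the actual implementation):

    package sketch

    import "math"

    // floorNoSSE41 outlines the fallback: any float64 with |x| >= 2^52 is
    // already integral (the mantissa has 52 fraction bits), and smaller
    // values are truncated via integer conversion, then fixed up downward.
    func floorNoSSE41(x float64) float64 {
        if math.IsNaN(x) || math.Abs(x) >= 1<<52 {
            return x // big values, Inf and NaN pass through unchanged
        }
        d := float64(int64(x)) // truncates toward zero
        if d > x {
            d-- // truncation rounded up for negative x; step down
        }
        return d
    }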