]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: intrinsify math.{Trunc/Ceil/Floor} on amd64
authorIlya Tocar <ilya.tocar@intel.com>
Thu, 5 Oct 2017 20:45:46 +0000 (15:45 -0500)
committerIlya Tocar <ilya.tocar@intel.com>
Tue, 31 Oct 2017 19:30:54 +0000 (19:30 +0000)
This significantly speed-ups Trunc.
Ceil/Floor are using the same instruction, so do them too.

name     old time/op  new time/op  delta
Floor-6  3.33ns ± 1%  3.22ns ± 0%   -3.39%  (p=0.000 n=10+10)
Ceil-6   3.33ns ± 1%  3.22ns ± 0%   -3.16%  (p=0.000 n=10+7)
Trunc-6  4.83ns ± 0%  3.22ns ± 0%  -33.36%  (p=0.000 n=6+8)

Change-Id: If848790e458eedfe38a6a0407bb4f589c68ac254
Reviewed-on: https://go-review.googlesource.com/68630
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/gc/builtin.go
src/cmd/compile/internal/gc/builtin/runtime.go
src/cmd/compile/internal/gc/ssa.go
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/gen/AMD64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/math/floor_amd64.s

index c116336c7fe353c19f8dce722862cc03686db760..5bf8f0e4d8521a3e8a28b0cd1a2fb5508566cf58 100644 (file)
@@ -852,6 +852,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.From.Reg = v.Args[0].Reg()
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg()
+       case ssa.OpAMD64ROUNDSD:
+               p := s.Prog(v.Op.Asm())
+               val := v.AuxInt
+               // 1 means math.Floor, 2 Ceil, 3 Trunc
+               if val != 1 && val != 2 && val != 3 {
+                       v.Fatalf("Invalid rounding mode")
+               }
+               p.From.Offset = val
+               p.From.Type = obj.TYPE_CONST
+               p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = v.Reg()
        case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
                if v.Args[0].Reg() != v.Reg() {
                        // POPCNT on Intel has a false dependency on the destination register.
index 79e1a51a4bedd5fee007c459d81a64f718543752..a72b36b1fded5d79b4693cf60b2ce6507c4a3803 100644 (file)
@@ -146,6 +146,7 @@ var runtimeDecls = [...]struct {
        {"msanread", funcTag, 112},
        {"msanwrite", funcTag, 112},
        {"support_popcnt", varTag, 11},
+       {"support_sse41", varTag, 11},
 }
 
 func runtimeTypes() []*types.Type {
index 03b853ecc50b0a73878da08aadb8585e96d50a8d..5220e251fa67d7cc3b1e3a8c7eeb73354090f1e4 100644 (file)
@@ -191,3 +191,4 @@ func msanwrite(addr, size uintptr)
 
 // architecture variants
 var support_popcnt bool
+var support_sse41 bool
index 233c639ba08a118ac78ad68f9cf931bac5247318..34c74a281b1d21f0c609aaf374ef03ae588e81b9 100644 (file)
@@ -2823,6 +2823,47 @@ func init() {
                },
                sys.PPC64)
 
+       makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+               return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+                       aux := syslook("support_sse41").Sym.Linksym()
+                       addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), aux, s.sb)
+                       v := s.newValue2(ssa.OpLoad, types.Types[TBOOL], addr, s.mem())
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
+
+                       // We have the intrinsic - use it directly.
+                       s.startBlock(bTrue)
+                       s.vars[n] = s.newValue1(op, types.Types[TFLOAT64], args[0])
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Call the pure Go version.
+                       s.startBlock(bFalse)
+                       a := s.call(n, callNormal)
+                       s.vars[n] = s.newValue2(ssa.OpLoad, types.Types[TFLOAT64], a, s.mem())
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+                       return s.variable(n, types.Types[TFLOAT64])
+               }
+       }
+       addF("math", "Floor",
+               makeRoundAMD64(ssa.OpFloor),
+               sys.AMD64)
+       addF("math", "Ceil",
+               makeRoundAMD64(ssa.OpCeil),
+               sys.AMD64)
+       addF("math", "Trunc",
+               makeRoundAMD64(ssa.OpTrunc),
+               sys.AMD64)
+
        /******** math/bits ********/
        addF("math/bits", "TrailingZeros64",
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
index c0d9dda386760648442c6b9b2915abf7837157be..d26cdfba56fab7fe7ceaff07f1dbcbc009205de2 100644 (file)
 
 (Sqrt x) -> (SQRTSD x)
 
+(Floor x) -> (ROUNDSD [1] x)
+(Ceil x)  -> (ROUNDSD [2] x)
+(Trunc x) -> (ROUNDSD [3] x)
+
 // Lowering extension
 // Note: we always extend to 64 bits even though some ops don't need that many result bits.
 (SignExt8to16  x) -> (MOVBQSX x)
index a15d3e4519905babc90d7e09b7fe739890afa990..0c3b2efa305d7f11a703fb8f7c98b940bc371147 100644 (file)
@@ -341,6 +341,10 @@ func init() {
 
                {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
 
+               // ROUNDSD instruction isn't guaranteed to be on the target platform (it is SSE4.1)
+               // Any use must be preceded by a successful check of runtime.support_sse41.
+               {name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, // rounds arg0 depending on auxint, 1 means math.Floor, 2 Ceil, 3 Trunc
+
                {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
                {name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
                // Note: SBBW and SBBB are subsumed by SBBL
index 9d44e1ab84c46019e79fc112bb9615dfca496888..8d1a70f654a042f48a89a1f402bdafe786f87f79 100644 (file)
@@ -557,6 +557,7 @@ const (
        OpAMD64POPCNTQ
        OpAMD64POPCNTL
        OpAMD64SQRTSD
+       OpAMD64ROUNDSD
        OpAMD64SBBQcarrymask
        OpAMD64SBBLcarrymask
        OpAMD64SETEQ
@@ -6716,6 +6717,20 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:    "ROUNDSD",
+               auxType: auxInt8,
+               argLen:  1,
+               asm:     x86.AROUNDSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                       },
+                       outputs: []outputInfo{
+                               {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                       },
+               },
+       },
        {
                name:   "SBBQcarrymask",
                argLen: 1,
index df49aa5df7d1ef8dccc4ae426e3364fe9926b775..e6f574b6c114d29d7b4eba2b37491db468c8680c 100644 (file)
@@ -473,6 +473,8 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpBswap32_0(v)
        case OpBswap64:
                return rewriteValueAMD64_OpBswap64_0(v)
+       case OpCeil:
+               return rewriteValueAMD64_OpCeil_0(v)
        case OpClosureCall:
                return rewriteValueAMD64_OpClosureCall_0(v)
        case OpCom16:
@@ -563,6 +565,8 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpEqB_0(v)
        case OpEqPtr:
                return rewriteValueAMD64_OpEqPtr_0(v)
+       case OpFloor:
+               return rewriteValueAMD64_OpFloor_0(v)
        case OpGeq16:
                return rewriteValueAMD64_OpGeq16_0(v)
        case OpGeq16U:
@@ -893,6 +897,8 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpSub8_0(v)
        case OpSubPtr:
                return rewriteValueAMD64_OpSubPtr_0(v)
+       case OpTrunc:
+               return rewriteValueAMD64_OpTrunc_0(v)
        case OpTrunc16to8:
                return rewriteValueAMD64_OpTrunc16to8_0(v)
        case OpTrunc32to16:
@@ -42210,6 +42216,18 @@ func rewriteValueAMD64_OpBswap64_0(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpCeil_0(v *Value) bool {
+       // match: (Ceil x)
+       // cond:
+       // result: (ROUNDSD [2] x)
+       for {
+               x := v.Args[0]
+               v.reset(OpAMD64ROUNDSD)
+               v.AuxInt = 2
+               v.AddArg(x)
+               return true
+       }
+}
 func rewriteValueAMD64_OpClosureCall_0(v *Value) bool {
        // match: (ClosureCall [argwid] entry closure mem)
        // cond:
@@ -42958,6 +42976,18 @@ func rewriteValueAMD64_OpEqPtr_0(v *Value) bool {
        }
        return false
 }
+func rewriteValueAMD64_OpFloor_0(v *Value) bool {
+       // match: (Floor x)
+       // cond:
+       // result: (ROUNDSD [1] x)
+       for {
+               x := v.Args[0]
+               v.reset(OpAMD64ROUNDSD)
+               v.AuxInt = 1
+               v.AddArg(x)
+               return true
+       }
+}
 func rewriteValueAMD64_OpGeq16_0(v *Value) bool {
        b := v.Block
        _ = b
@@ -46885,6 +46915,18 @@ func rewriteValueAMD64_OpSubPtr_0(v *Value) bool {
        }
        return false
 }
+func rewriteValueAMD64_OpTrunc_0(v *Value) bool {
+       // match: (Trunc x)
+       // cond:
+       // result: (ROUNDSD [3] x)
+       for {
+               x := v.Args[0]
+               v.reset(OpAMD64ROUNDSD)
+               v.AuxInt = 3
+               v.AddArg(x)
+               return true
+       }
+}
 func rewriteValueAMD64_OpTrunc16to8_0(v *Value) bool {
        // match: (Trunc16to8 x)
        // cond:
index 678d64361c4885b145039a9df1674aa4082a9925..4ef02eb09aaafdb55717bf84dc9ca5723fdd23f7 100644 (file)
@@ -8,12 +8,6 @@
 
 // func Floor(x float64) float64
 TEXT ·Floor(SB),NOSPLIT,$0
-       CMPB    ·useSSE41(SB), $1
-       JNE     nosse4
-       ROUNDSD $1, x+0(FP), X0
-       MOVQ X0, ret+8(FP)
-       RET
-nosse4:
        MOVQ    x+0(FP), AX
        MOVQ    $~(1<<63), DX // sign bit mask
        ANDQ    AX,DX // DX = |x|
@@ -36,12 +30,6 @@ isBig_floor:
 
 // func Ceil(x float64) float64
 TEXT ·Ceil(SB),NOSPLIT,$0
-       CMPB    ·useSSE41(SB), $1
-       JNE     nosse4
-       ROUNDSD $2, x+0(FP), X0
-       MOVQ X0, ret+8(FP)
-       RET
-nosse4:
        MOVQ    x+0(FP), AX
        MOVQ    $~(1<<63), DX // sign bit mask
        MOVQ    AX, BX // BX = copy of x